blob: e39e69c6fbbca2e2fa515d8dc1d91bb1cfba3262 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file bounding_box.cu
* \brief Bounding box util functions and operators
* \author Joshua Zhang
*/
#include <cub/cub.cuh>
#include "./bounding_box-inl.cuh"
#include "./bounding_box-inl.h"
#include "../elemwise_op_common.h"
namespace mxnet {
namespace op {
namespace {
using mshadow::Stream;
using mshadow::Tensor;
template <typename DType>
struct TempWorkspace {
size_t scores_temp_space;
DType* scores;
size_t scratch_space;
uint8_t* scratch;
size_t buffer_space;
DType* buffer;
size_t nms_scratch_space;
uint32_t* nms_scratch;
size_t indices_temp_spaces;
index_t* indices;
};
inline size_t ceil_div(size_t x, size_t y) {
return (x + y - 1) / y;
}
inline size_t align(size_t x, size_t alignment) {
return ceil_div(x, alignment) * alignment;
}
template <typename DType>
__global__ void FilterAndPrepareAuxDataKernel(const DType* data,
DType* out,
DType* scores,
index_t num_elements_per_batch,
const index_t element_width,
const index_t N,
const float threshold,
const int id_index,
const int score_index,
const int background_id) {
index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
bool first_in_element = (tid % element_width == 0);
index_t start_of_my_element = tid - (tid % element_width);
if (tid < N) {
DType my_score = data[start_of_my_element + score_index];
bool filtered_out = my_score <= threshold;
if (id_index != -1 && background_id != -1) {
DType my_id = data[start_of_my_element + id_index];
filtered_out = filtered_out || (my_id == background_id);
}
if (!filtered_out) {
out[tid] = data[tid];
} else {
out[tid] = -1;
my_score = -1;
}
if (first_in_element) {
index_t offset = tid / element_width;
scores[offset] = my_score;
}
}
}
template <typename DType>
void FilterAndPrepareAuxData(const Tensor<gpu, 3, DType>& data,
Tensor<gpu, 3, DType>* out,
const TempWorkspace<DType>& workspace,
const BoxNMSParam& param,
Stream<gpu>* s) {
const int n_threads = 512;
index_t N = data.shape_.Size();
const auto blocks = ceil_div(N, n_threads);
FilterAndPrepareAuxDataKernel<<<blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(
data.dptr_,
out->dptr_,
workspace.scores,
data.shape_[1],
data.shape_[2],
N,
param.valid_thresh,
param.id_index,
param.score_index,
param.background_id);
}
template <bool check_topk, bool check_score, typename DType>
__global__ void CompactDataKernel(const index_t* indices,
const DType* source,
DType* destination,
const index_t topk,
const index_t element_width,
const index_t num_elements_per_batch,
const int score_index,
const index_t N) {
const index_t tid_start = blockIdx.x * blockDim.x + threadIdx.x;
for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) {
const index_t my_element = tid / element_width;
const index_t my_element_in_batch = my_element % num_elements_per_batch;
if (check_topk && my_element_in_batch >= topk) {
destination[tid] = -1;
} else {
DType ret;
const index_t source_element = indices[my_element];
DType score = 0;
if (check_score) {
score = source[source_element * element_width + score_index];
}
if (score >= 0) {
ret = source[source_element * element_width + tid % element_width];
} else {
ret = -1;
}
destination[tid] = ret;
}
}
}
template <bool check_score, typename DType>
void CompactData(const Tensor<gpu, 1, index_t>& indices,
const Tensor<gpu, 3, DType>& source,
Tensor<gpu, 3, DType>* destination,
const index_t topk,
const int score_index,
Stream<gpu>* s) {
const int n_threads = 512;
const size_t max_blocks = 320;
index_t N = source.shape_.Size();
const auto blocks = std::min(ceil_div(N, n_threads), max_blocks);
if (topk > 0) {
CompactDataKernel<true, check_score>
<<<blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(indices.dptr_,
source.dptr_,
destination->dptr_,
topk,
source.shape_[2],
source.shape_[1],
score_index,
N);
} else {
CompactDataKernel<false, check_score>
<<<blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(indices.dptr_,
source.dptr_,
destination->dptr_,
topk,
source.shape_[2],
source.shape_[1],
score_index,
N);
}
}
template <typename DType>
void WorkspaceForSort(const index_t num_elem,
const index_t topk,
const int alignment,
TempWorkspace<DType>* workspace) {
const size_t sort_scores_temp_space =
mxnet::op::SortByKeyWorkspaceSize<DType, index_t, gpu>(num_elem, false, false);
const size_t sort_topk_scores_temp_space =
mxnet::op::SortByKeyWorkspaceSize<DType, index_t, gpu>(topk, false, false);
workspace->scratch_space =
align(std::max(sort_scores_temp_space, sort_topk_scores_temp_space), alignment);
}
template <int encode, typename DType>
__global__ void CalculateGreedyNMSResultsKernel(const DType* data,
uint32_t* result,
const index_t current_start,
const index_t num_elems,
const index_t num_batches,
const index_t num_blocks_per_row_batch,
const index_t num_blocks_per_row,
const index_t topk,
const index_t element_width,
const index_t num_elements_per_batch,
const int coord_index,
const int class_index,
const int score_index,
const float threshold);
template <typename DType>
__global__ void ReduceNMSResultTriangleKernel(uint32_t* nms_results,
DType* data,
const index_t score_index,
const index_t element_width,
const index_t num_batches,
const index_t num_elems,
const index_t start_index,
const index_t topk);
template <typename DType>
__global__ void ReduceNMSResultRestKernel(DType* data,
const uint32_t* nms_results,
const index_t score_index,
const index_t element_width,
const index_t num_batches,
const index_t num_elements_per_batch,
const index_t start_index,
const index_t topk,
const index_t num_blocks_per_batch);
template <typename DType>
struct NMS {
static constexpr int THRESHOLD = 512;
void operator()(Tensor<gpu, 3, DType>* data,
Tensor<gpu, 2, uint32_t>* scratch,
const index_t topk,
const BoxNMSParam& param,
Stream<gpu>* s) {
const int n_threads = 512;
const index_t num_batches = data->shape_[0];
const index_t num_elements_per_batch = data->shape_[1];
const index_t element_width = data->shape_[2];
for (index_t current_start = 0; current_start < topk; current_start += THRESHOLD) {
const index_t n_elems = topk - current_start;
const index_t num_blocks_per_row_batch = ceil_div(n_elems, n_threads);
const index_t num_blocks_per_row = num_blocks_per_row_batch * num_batches;
const index_t n_blocks = THRESHOLD / (sizeof(uint32_t) * 8) * num_blocks_per_row;
if (param.in_format == box_common_enum::kCorner) {
CalculateGreedyNMSResultsKernel<box_common_enum::kCorner>
<<<n_blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(
data->dptr_,
scratch->dptr_,
current_start,
n_elems,
num_batches,
num_blocks_per_row_batch,
num_blocks_per_row,
topk,
element_width,
num_elements_per_batch,
param.coord_start,
param.force_suppress ? -1 : param.id_index,
param.score_index,
param.overlap_thresh);
} else {
CalculateGreedyNMSResultsKernel<box_common_enum::kCenter>
<<<n_blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(
data->dptr_,
scratch->dptr_,
current_start,
n_elems,
num_batches,
num_blocks_per_row_batch,
num_blocks_per_row,
topk,
element_width,
num_elements_per_batch,
param.coord_start,
param.force_suppress ? -1 : param.id_index,
param.score_index,
param.overlap_thresh);
}
ReduceNMSResultTriangleKernel<<<num_batches, THRESHOLD, 0, Stream<gpu>::GetStream(s)>>>(
scratch->dptr_,
data->dptr_,
param.score_index,
element_width,
num_batches,
num_elements_per_batch,
current_start,
topk);
const index_t n_rest_elems = n_elems - THRESHOLD;
const index_t num_rest_blocks_per_batch = ceil_div(n_rest_elems, n_threads);
const index_t num_rest_blocks = num_rest_blocks_per_batch * num_batches;
if (n_rest_elems > 0) {
ReduceNMSResultRestKernel<<<num_rest_blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(
data->dptr_,
scratch->dptr_,
param.score_index,
element_width,
num_batches,
num_elements_per_batch,
current_start,
topk,
num_rest_blocks_per_batch);
}
}
}
};
template <int encode, typename DType>
__device__ __forceinline__ DType
calculate_area(const DType b0, const DType b1, const DType b2, const DType b3) {
DType width = b2;
DType height = b3;
if (encode == box_common_enum::kCorner) {
width -= b0;
height -= b1;
}
if (width < 0 || height < 0)
return 0;
return width * height;
}
template <int encode, typename DType>
__device__ __forceinline__ DType calculate_intersection(const DType a0,
const DType a1,
const DType a2,
const DType a3,
const DType b0,
const DType b1,
const DType b2,
const DType b3) {
DType wx, wy;
if (encode == box_common_enum::kCorner) {
const DType left = a0 > b0 ? a0 : b0;
const DType bottom = a1 > b1 ? a1 : b1;
const DType right = a2 < b2 ? a2 : b2;
const DType top = a3 < b3 ? a3 : b3;
wx = right - left;
wy = top - bottom;
} else {
const DType al = 2 * a0 - a2;
const DType ar = 2 * a0 + a2;
const DType bl = 2 * b0 - b2;
const DType br = 2 * b0 + b2;
const DType left = bl > al ? bl : al;
const DType right = br < ar ? br : ar;
wx = right - left;
const DType ab = 2 * a1 - a3;
const DType at = 2 * a1 + a3;
const DType bb = 2 * b1 - b3;
const DType bt = 2 * b1 + b3;
const DType bottom = bb > ab ? bb : ab;
const DType top = bt < at ? bt : at;
wy = top - bottom;
wy = wy / 4; // To compensate for both wx and wy being 2x too large
}
if (wx <= 0 || wy <= 0) {
return 0;
} else {
return (wx * wy);
}
}
template <int encode, typename DType>
__launch_bounds__(512) __global__
void CalculateGreedyNMSResultsKernel(const DType* data,
uint32_t* result,
const index_t current_start,
const index_t num_elems,
const index_t num_batches,
const index_t num_blocks_per_row_batch,
const index_t num_blocks_per_row,
const index_t topk,
const index_t element_width,
const index_t num_elements_per_batch,
const int coord_index,
const int class_index,
const int score_index,
const float threshold) {
constexpr int max_elem_width = 20;
constexpr int num_other_boxes = sizeof(uint32_t) * 8;
__shared__ DType other_boxes[max_elem_width * num_other_boxes];
__shared__ DType other_boxes_areas[num_other_boxes];
const index_t my_row = blockIdx.x / num_blocks_per_row;
const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row;
const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch;
const index_t my_batch = (my_block_offset_in_row) / num_blocks_per_row_batch;
const index_t my_element_in_batch =
my_block_offset_in_batch * blockDim.x + current_start + threadIdx.x;
// Load other boxes
const index_t offset =
(my_batch * num_elements_per_batch + current_start + my_row * num_other_boxes) *
element_width;
for (int i = threadIdx.x; i < element_width * num_other_boxes; i += blockDim.x) {
other_boxes[i] = data[offset + i];
}
__syncthreads();
if (threadIdx.x < num_other_boxes) {
const int other_boxes_offset = element_width * threadIdx.x;
const DType their_area =
calculate_area<encode>(other_boxes[other_boxes_offset + coord_index + 0],
other_boxes[other_boxes_offset + coord_index + 1],
other_boxes[other_boxes_offset + coord_index + 2],
other_boxes[other_boxes_offset + coord_index + 3]);
other_boxes_areas[threadIdx.x] = their_area;
}
__syncthreads();
if (my_element_in_batch >= topk)
return;
DType my_box[4];
DType my_class = -1;
DType my_score = -1;
const index_t my_offset =
(my_batch * num_elements_per_batch + my_element_in_batch) * element_width;
my_score = data[my_offset + score_index];
#pragma unroll
for (int i = 0; i < 4; ++i) {
my_box[i] = data[my_offset + coord_index + i];
}
if (class_index != -1) {
my_class = data[my_offset + class_index];
}
DType my_area = calculate_area<encode>(my_box[0], my_box[1], my_box[2], my_box[3]);
uint32_t ret = 0;
if (my_score != -1) {
#pragma unroll
for (int i = 0; i < num_other_boxes; ++i) {
const int other_boxes_offset = element_width * i;
if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) &&
other_boxes[other_boxes_offset + score_index] != -1) {
const DType their_area = other_boxes_areas[i];
const DType intersect =
calculate_intersection<encode>(my_box[0],
my_box[1],
my_box[2],
my_box[3],
other_boxes[other_boxes_offset + coord_index + 0],
other_boxes[other_boxes_offset + coord_index + 1],
other_boxes[other_boxes_offset + coord_index + 2],
other_boxes[other_boxes_offset + coord_index + 3]);
if (intersect > threshold * (my_area + their_area - intersect)) {
ret = ret | (1u << i);
}
}
}
}
result[(my_row * num_batches + my_batch) * topk + my_element_in_batch] = ~ret;
}
template <typename DType>
__launch_bounds__(NMS<DType>::THRESHOLD) __global__
void ReduceNMSResultTriangleKernel(uint32_t* nms_results,
DType* data,
const index_t score_index,
const index_t element_width,
const index_t num_batches,
const index_t num_elements_per_batch,
const index_t start_index,
const index_t topk) {
constexpr int n_threads = NMS<DType>::THRESHOLD;
constexpr int warp_size = 32;
const index_t my_batch = blockIdx.x;
const index_t my_element_in_batch = threadIdx.x + start_index;
const index_t my_element = my_batch * topk + my_element_in_batch;
const int my_warp = threadIdx.x / warp_size;
const int my_lane = threadIdx.x % warp_size;
__shared__ uint32_t current_valid_boxes[n_threads / warp_size];
const uint32_t full_mask = 0xFFFFFFFF;
const uint32_t my_lane_mask = 1 << my_lane;
const uint32_t earlier_threads_mask = (1 << (my_lane + 1)) - 1;
uint32_t valid = my_lane_mask;
uint32_t valid_boxes = full_mask;
uint32_t my_next_mask = my_element_in_batch < topk ? nms_results[my_element] : full_mask;
#pragma unroll
for (int i = 0; i < n_threads / warp_size; ++i) {
uint32_t my_mask = my_next_mask;
my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) ?
nms_results[(i + 1) * topk * num_batches + my_element] :
full_mask;
if (my_warp == i && !__all_sync(full_mask, my_mask == full_mask)) {
my_mask = my_mask | earlier_threads_mask;
// Loop over warp_size - 1 because the last
// thread does not contribute to the mask anyway
#pragma unroll
for (int j = 0; j < warp_size - 1; ++j) {
const uint32_t mask = __shfl_sync(full_mask, valid ? my_mask : full_mask, j);
valid = valid & mask;
}
valid_boxes = __ballot_sync(full_mask, valid);
}
if (my_lane == 0 && my_warp == i) {
current_valid_boxes[i] = valid_boxes;
}
__syncthreads();
if ((my_warp > i) && (((~my_mask) & current_valid_boxes[i]) != 0)) {
valid = 0;
}
}
if (my_lane == 0) {
nms_results[my_element] = valid_boxes;
}
if (valid == 0) {
data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] =
-1;
}
}
template <typename DType>
__launch_bounds__(512) __global__
void ReduceNMSResultRestKernel(DType* data,
const uint32_t* nms_results,
const index_t score_index,
const index_t element_width,
const index_t num_batches,
const index_t num_elements_per_batch,
const index_t start_index,
const index_t topk,
const index_t num_blocks_per_batch) {
constexpr int num_other_boxes = sizeof(uint32_t) * 8;
constexpr int num_iterations = NMS<DType>::THRESHOLD / num_other_boxes;
constexpr int warp_size = 32;
const index_t my_block_offset_in_batch = blockIdx.x % num_blocks_per_batch;
const index_t my_batch = blockIdx.x / num_blocks_per_batch;
const index_t my_element_in_batch =
my_block_offset_in_batch * blockDim.x + start_index + NMS<DType>::THRESHOLD + threadIdx.x;
const index_t my_element = my_batch * topk + my_element_in_batch;
if (my_element_in_batch >= topk)
return;
bool valid = true;
#pragma unroll
for (int i = 0; i < num_iterations; ++i) {
const uint32_t my_mask = nms_results[i * topk * num_batches + my_element];
const uint32_t valid_boxes = nms_results[my_batch * topk + i * warp_size + start_index];
const bool no_hit = (valid_boxes & (~my_mask)) == 0;
valid = valid && no_hit;
}
if (!valid) {
data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] =
-1;
}
}
template <typename DType>
TempWorkspace<DType> GetWorkspace(const index_t num_batch,
const index_t num_elem,
const int width_elem,
const index_t topk,
const OpContext& ctx) {
TempWorkspace<DType> workspace;
Stream<gpu>* s = ctx.get_stream<gpu>();
const int alignment = 128;
// Get the workspace size
workspace.scores_temp_space = 2 * align(num_batch * num_elem * sizeof(DType), alignment);
workspace.indices_temp_spaces = 2 * align(num_batch * num_elem * sizeof(index_t), alignment);
WorkspaceForSort(num_elem, topk, alignment, &workspace);
// Place for a buffer
workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment);
workspace.nms_scratch_space =
align(NMS<DType>::THRESHOLD / (sizeof(uint32_t) * 8) * num_batch * topk * sizeof(uint32_t),
alignment);
const size_t workspace_size = workspace.scores_temp_space + workspace.scratch_space +
workspace.buffer_space + workspace.nms_scratch_space +
workspace.indices_temp_spaces;
// Obtain the memory for workspace
Tensor<gpu, 1, double> scratch_memory =
ctx.requested[box_nms_enum::kTempSpace].get_space_typed<gpu, 1, double>(
mshadow::Shape1(ceil_div(workspace_size, sizeof(double))), s);
// Populate workspace pointers
workspace.scores = reinterpret_cast<DType*>(scratch_memory.dptr_);
workspace.scratch = reinterpret_cast<uint8_t*>(workspace.scores) + workspace.scores_temp_space;
workspace.buffer = reinterpret_cast<DType*>(workspace.scratch + workspace.scratch_space);
workspace.nms_scratch = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(workspace.buffer) +
workspace.buffer_space);
workspace.indices = reinterpret_cast<index_t*>(reinterpret_cast<uint8_t*>(workspace.nms_scratch) +
workspace.nms_scratch_space);
return workspace;
}
template <typename DType>
__global__ void ExtractScoresKernel(const DType* data,
DType* scores,
const index_t N,
const int element_width,
const int score_index) {
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N) {
scores[tid] = data[tid * element_width + score_index];
}
}
template <typename DType>
void CompactNMSResults(const Tensor<gpu, 3, DType>& data,
Tensor<gpu, 3, DType>* out,
Tensor<gpu, 1, index_t>* indices,
Tensor<gpu, 1, DType>* scores,
Tensor<gpu, 1, index_t>* sorted_indices,
Tensor<gpu, 1, DType>* sorted_scores,
Tensor<gpu, 1, char>* scratch,
const int score_index,
const index_t topk,
Stream<gpu>* s) {
using mshadow::Shape1;
constexpr int n_threads = 512;
const index_t num_elements = scores->shape_.Size();
const index_t num_elements_per_batch = data.shape_[1];
const index_t num_batches = data.shape_[0];
const int element_width = data.shape_[2];
const index_t n_blocks = ceil_div(num_elements, n_threads);
ExtractScoresKernel<<<n_blocks, n_threads, 0, Stream<gpu>::GetStream(s)>>>(
data.dptr_, scores->dptr_, num_elements, element_width, score_index);
*indices = mshadow::expr::range<index_t>(0, num_elements);
for (index_t i = 0; i < num_batches; ++i) {
// Sort each batch separately
Tensor<gpu, 1, DType> scores_batch(scores->dptr_ + i * num_elements_per_batch, Shape1(topk), s);
Tensor<gpu, 1, index_t> indices_batch(
indices->dptr_ + i * num_elements_per_batch, Shape1(topk), s);
Tensor<gpu, 1, DType> sorted_scores_batch(
sorted_scores->dptr_ + i * num_elements_per_batch, Shape1(topk), s);
Tensor<gpu, 1, index_t> sorted_indices_batch(
sorted_indices->dptr_ + i * num_elements_per_batch, Shape1(topk), s);
mxnet::op::SortByKey(scores_batch,
indices_batch,
false,
scratch,
0,
8 * sizeof(DType),
&sorted_scores_batch,
&sorted_indices_batch);
}
CompactData<true>(*sorted_indices, data, out, topk, score_index, s);
}
} // namespace
void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using mshadow::Shape1;
using mshadow::Shape2;
using mshadow::Shape3;
CHECK_NE(req[0], kAddTo) << "BoxNMS does not support kAddTo";
CHECK_NE(req[0], kWriteInplace) << "BoxNMS does not support in place computation";
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]";
const BoxNMSParam& param = nnvm::get<BoxNMSParam>(attrs.parsed);
Stream<gpu>* s = ctx.get_stream<gpu>();
mxnet::TShape in_shape = inputs[box_nms_enum::kData].shape_;
int indim = in_shape.ndim();
int num_batch = indim <= 2 ? 1 : in_shape.ProdShape(0, indim - 2);
int num_elem = in_shape[indim - 2];
int width_elem = in_shape[indim - 1];
MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Tensor<gpu, 3, DType> data = inputs[box_nms_enum::kData].get_with_shape<gpu, 3, DType>(
Shape3(num_batch, num_elem, width_elem), s);
Tensor<gpu, 3, DType> out = outputs[box_nms_enum::kOut].get_with_shape<gpu, 3, DType>(
Shape3(num_batch, num_elem, width_elem), s);
// Special case for topk == 0
if (param.topk == 0) {
if (req[0] != kNullOp && req[0] != kWriteInplace) {
out = mshadow::expr::F<mshadow_op::identity>(data);
}
return;
}
index_t topk = param.topk > 0 ? std::min(param.topk, num_elem) : num_elem;
const auto& workspace = GetWorkspace<DType>(num_batch, num_elem, width_elem, topk, ctx);
FilterAndPrepareAuxData(data, &out, workspace, param, s);
Tensor<gpu, 1, DType> scores(workspace.scores, Shape1(num_batch * num_elem), s);
Tensor<gpu, 1, DType> sorted_scores(
workspace.scores + scores.MSize(), Shape1(num_batch * num_elem), s);
Tensor<gpu, 1, index_t> indices(workspace.indices, Shape1(num_batch * num_elem), s);
Tensor<gpu, 1, index_t> sorted_indices(
workspace.indices + indices.MSize(), Shape1(num_batch * num_elem), s);
Tensor<gpu, 1, char> scratch(
reinterpret_cast<char*>(workspace.scratch), Shape1(workspace.scratch_space), s);
Tensor<gpu, 3, DType> buffer(workspace.buffer, Shape3(num_batch, num_elem, width_elem), s);
Tensor<gpu, 2, uint32_t> nms_scratch(
workspace.nms_scratch,
Shape2(NMS<DType>::THRESHOLD / (sizeof(uint32_t) * 8), topk * num_batch),
s);
indices = mshadow::expr::range<index_t>(0, num_batch * num_elem);
for (index_t i = 0; i < num_batch; ++i) {
// Sort each batch separately
Tensor<gpu, 1, DType> scores_batch(scores.dptr_ + i * num_elem, Shape1(num_elem), s);
Tensor<gpu, 1, index_t> indices_batch(indices.dptr_ + i * num_elem, Shape1(num_elem), s);
Tensor<gpu, 1, DType> sorted_scores_batch(
sorted_scores.dptr_ + i * num_elem, Shape1(num_elem), s);
Tensor<gpu, 1, index_t> sorted_indices_batch(
sorted_indices.dptr_ + i * num_elem, Shape1(num_elem), s);
mxnet::op::SortByKey(scores_batch,
indices_batch,
false,
&scratch,
0,
8 * sizeof(DType),
&sorted_scores_batch,
&sorted_indices_batch);
}
CompactData<false>(sorted_indices, out, &buffer, topk, -1, s);
NMS<DType> nms;
nms(&buffer, &nms_scratch, topk, param, s);
CompactNMSResults(buffer,
&out,
&indices,
&scores,
&sorted_indices,
&sorted_scores,
&scratch,
param.score_index,
topk,
s);
// convert encoding
if (param.in_format != param.out_format) {
if (box_common_enum::kCenter == param.out_format) {
mxnet::op::mxnet_op::Kernel<corner_to_center, gpu>::Launch(
s, num_batch * num_elem, out.dptr_ + param.coord_start, width_elem);
} else {
mxnet::op::mxnet_op::Kernel<center_to_corner, gpu>::Launch(
s, num_batch * num_elem, out.dptr_ + param.coord_start, width_elem);
}
}
});
}
void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
using namespace mxnet_op;
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]";
if (req[1] == kNullOp) {
BoxNMSForwardGPU_notemp(attrs, ctx, inputs, req, outputs);
return;
}
BoxNMSForward<gpu>(attrs, ctx, inputs, req, outputs);
}
NNVM_REGISTER_OP(_contrib_box_nms).set_attr<FCompute>("FCompute<gpu>", BoxNMSForwardGPU);
NNVM_REGISTER_OP(_backward_contrib_box_nms)
.set_attr<FCompute>("FCompute<gpu>", BoxNMSBackward<gpu>);
NNVM_REGISTER_OP(_contrib_box_iou).set_attr<FCompute>("FCompute<gpu>", BoxOverlapForward<gpu>);
NNVM_REGISTER_OP(_backward_contrib_box_iou)
.set_attr<FCompute>("FCompute<gpu>", BoxOverlapBackward<gpu>);
NNVM_REGISTER_OP(_contrib_bipartite_matching)
.set_attr<FCompute>("FCompute<gpu>", BipartiteMatchingForward<gpu>);
NNVM_REGISTER_OP(_backward_contrib_bipartite_matching)
.set_attr<FCompute>("FCompute<gpu>", BipartiteMatchingBackward<gpu>);
NNVM_REGISTER_OP(_contrib_box_encode).set_attr<FCompute>("FCompute<gpu>", BoxEncodeForward<gpu>);
NNVM_REGISTER_OP(_contrib_box_decode).set_attr<FCompute>("FCompute<gpu>", BoxDecodeForward<gpu>);
} // namespace op
} // namespace mxnet