/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#include "broadcast_reduce-inl.h"
#include "elemwise_unary_op.h"

#if MXNET_USE_CUDA
#include "../../common/cuda/rtc.h"
#endif  // MXNET_USE_CUDA

using namespace mshadow;

namespace mxnet {
namespace op {
namespace broadcast {

#if MXNET_USE_CUDA

namespace {

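// Host-side parameter block passed by value to the RTC-compiled reduction kernels.
// It must stay layout-compatible with the reduce_kernel_params struct defined inside
// reduce_kernel_code below.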
struct reduce_kernel_params {
  index_t big_shape[MAX_DIM];
  index_t small_shape[MAX_DIM];
  index_t lhs_shape0[MAX_DIM];
  index_t rhs_shape0[MAX_DIM];
  index_t rshape[MAX_DIM];
  index_t rstride[MAX_DIM];
  index_t lhs_stride[MAX_DIM];
  index_t rhs_stride[MAX_DIM];
  index_t lhs_shape[MAX_DIM];
  index_t rhs_shape[MAX_DIM];
};

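// The snippets below define FUNC (how one input element is read and transformed) and
// AType (the accumulation type) for the generated kernels:
//  - reduce_function_code:           plain reduction of OP(big[i]).
//  - reduce_function_use_input_code: reduction of OP1(big[i], OP2(lhs[i], rhs[i])).
//  - reduce_function_index_code:     reduction that also tracks the index of the
//                                    selected element (e.g. argmax/argmin-style reducers).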
const char reduce_function_code[] = R"code(
#define FUNC OP(IType0::from(big[idx_big[u]]))
using AType = typename AccType<InputType0>::type;
)code";

const char reduce_function_use_input_code[] = R"code(
#define FUNC OP1(IType0::from(big[idx_big[u]]), \
                 OP2(IType1::from(lhs[idx_lhs[u]]), \
                     IType2::from(rhs[idx_rhs[u]])))
using AType = typename AccType<InputType0>::type;
)code";

const char reduce_function_index_code[] = R"code(
#define FUNC AType(OP(IType0::from(big[idx_big[u]])), index)

template <typename T>
struct AccIndex {
  index_t idx;
  T num;

  __device__ inline AccIndex<T>() {}
  __device__ inline AccIndex<T>(const T& val, const index_t idx) : num(val), idx(idx) {}

  __device__ inline operator index_t() const volatile {
    return idx;
  }

  __device__ inline AccIndex<T>& operator=(const AccIndex<T>& other) {
    idx = other.idx;
    num = other.num;
    return *this;
  }
};

using AType = AccIndex<typename AccType<InputType0>::type>;
)code";

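// Device-side code shared by the reduction kernels: the kernel parameter struct
// (mirroring the host-side one above), the per-thread reduce() helper, and the
// single-pass (reduce_kernel_single) and multi-pass (reduce_kernel_multi) kernels.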
const char reduce_kernel_code[] = R"code(
struct reduce_kernel_params {
  index_t big_shape[util::MAX_DIM];
  index_t small_shape[util::MAX_DIM];
  index_t lhs_shape0[util::MAX_DIM];
  index_t rhs_shape0[util::MAX_DIM];
  index_t rshape[util::MAX_DIM];
  index_t rstride[util::MAX_DIM];
  index_t lhs_stride[util::MAX_DIM];
  index_t rhs_stride[util::MAX_DIM];
  index_t lhs_shape[util::MAX_DIM];
  index_t rhs_shape[util::MAX_DIM];
};

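// Each thread accumulates FUNC over its slice of the reduction range [Mstart, Mend),
// strided by by * UNROLL. Partial results are then combined across the tidy dimension
// with a tree reduction in shared memory; shTile stores (value, residual) pairs
// interleaved.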
inline __device__ AType reduce(const index_t idx, const int tidx,
                               const int tidy, const int N,
                               const index_t Mstart, const index_t Mend,
                               const InputType0* __restrict big,
                               const InputType1* __restrict lhs,
                               const InputType2* __restrict rhs,
                               const reduce_kernel_params& params) {
  extern __shared__ char shTileChar[];
  using IType0 = AccType<InputType0>;
  using IType1 = AccType<InputType1>;
  using IType2 = AccType<InputType2>;
  using OType = AccType<OutputType0>;
  AType* shTile = (AType*)(shTileChar);
  const int bx = (do_transpose) ? blockDim.y : blockDim.x;
  const int by = (do_transpose) ? blockDim.x : blockDim.y;
  index_t coord[ndim];
  util::unravel(idx, params.small_shape, coord);
  index_t idx_big0, idx_lhs0, idx_rhs0;
  idx_big0 = util::ravel(coord, params.big_shape);
  if (use_input) {
    idx_lhs0 = util::ravel(coord, params.lhs_shape0);
    idx_rhs0 = util::ravel(coord, params.rhs_shape0);
  }

  AType val, residual;
  REDUCER.SetInitValue(val, residual);
  if (idx < N) {
    for (index_t k = tidy + Mstart; k < Mend; k += by*UNROLL) {
      index_t idx_big[UNROLL];
      index_t idx_lhs[UNROLL];
      index_t idx_rhs[UNROLL];
#pragma unroll
      for (int u = 0; u < UNROLL; u++) {
        idx_big[u] = idx_big0 + util::unravel_dot<ndim>(k + u*by, params.rshape,
                                                        params.rstride);
        if (use_input) {
          idx_lhs[u] = idx_lhs0 + util::unravel_dot<ndim>(k + u*by, params.lhs_shape,
                                                          params.lhs_stride);
          idx_rhs[u] = idx_rhs0 + util::unravel_dot<ndim>(k + u*by, params.rhs_shape,
                                                          params.rhs_stride);
        }
      }
      AType tmp[UNROLL];
#pragma unroll
      for (int u = 0; u < UNROLL; u++) {
        if (k + u*by < Mend) {
          const index_t index = k + u*by;
          tmp[u] = FUNC;
        }
      }
#pragma unroll
      for (int u = 0; u < UNROLL; u++) {
        if (k + u*by < Mend) REDUCER.Reduce(val, tmp[u], residual);
      }
    }
  }

  // Shared memory holds a bx * by tile of (value, residual) pairs. The reduction is
  // along by; the final result ends up in the threads with tidy == 0.
  if (by > 1) {
    // Fix bx to avoid bank conflicts. Assumes warpSize number of banks
    const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx + 1) : bx;
    const int it0 = tidx + tidy*fbx;
    shTile[it0 * 2] = val;
    shTile[it0 * 2 + 1] = residual;
    __syncthreads();
    for (int t = 1; t < by; t <<= 1) {
      AType tmp, tmp_residual;
      REDUCER.SetInitValue(tmp, tmp_residual);
      if (tidy + t < by) {
        tmp = shTile[(it0 + t*fbx) * 2];
        tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
      }
      __syncthreads();
      REDUCER.Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, tmp_residual);
      __syncthreads();
    }
    if (idx < N && tidy == 0) {
      REDUCER.Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
      return shTile[tidx * 2];
    } else {
      return AType();
    }
  } else {
    if (idx < N) {
      REDUCER.Finalize(val, residual);
      return val;
    } else {
      return AType();
    }
  }
}

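// Single-pass kernel: each output element is reduced in one pass and written directly
// to small, honoring kAddTo.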
__launch_bounds__(kRTCMaxThreadsPerBlock)
__global__ void reduce_kernel_single(const int N, const int M,
                                     const InputType0* __restrict big,
                                     const InputType1* __restrict lhs,
                                     const InputType2* __restrict rhs,
                                     OutputType0 *small,
                                     const reduce_kernel_params params,
                                     const int Mnext) {
  using OType = AccType<OutputType0>;
  const int tid = threadIdx.x + threadIdx.y*blockDim.x;
  const int bx = (do_transpose) ? blockDim.y : blockDim.x;
  const int by = (do_transpose) ? blockDim.x : blockDim.y;
  const int tidx = (do_transpose) ? tid / by : threadIdx.x;
  const int tidy = (do_transpose) ? tid % by : threadIdx.y;
  for (int m0 = blockIdx.y; m0 < Mnext; m0 += gridDim.y) {
    // This thread block handles the M range [Mstart, ..., Mend - 1]
    const index_t Mstart = (index_t)((int64)M*(int64)m0/(int64)Mnext);
    const index_t Mend = (index_t)((int64)M*(int64)(m0 + 1)/(int64)Mnext);
    for (index_t idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) {
      const index_t idx = idx0 + tidx;
      AType val = reduce(idx, tidx, tidy, N, Mstart, Mend, big, lhs, rhs, params);
      if (idx < N && (by == 1 || tidy == 0)) {
        if (req == OpReqType::kAddTo) {
          small[idx + m0 * N] = OType::to(op::add(OType::from(small[idx + m0 * N]),
                                                  static_cast<typename OType::type>(val)));
        } else {
          small[idx + m0 * N] = OType::to(val);
        }
      }
    }
  }
}

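// Multi-pass kernel: each block reduces only its [Mstart, Mend) chunk and stores the
// partial result (in accumulation type AType) into the workspace; reduce_lines_kernel
// then combines the Mnext partial results per output element.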
__launch_bounds__(kRTCMaxThreadsPerBlock)
__global__ void reduce_kernel_multi(const int N, const int M,
                                    const InputType0* __restrict big,
                                    const InputType1* __restrict lhs,
                                    const InputType2* __restrict rhs,
                                    AType *small,
                                    const reduce_kernel_params params,
                                    const int Mnext) {
  const int tid = threadIdx.x + threadIdx.y*blockDim.x;
  const int bx = (do_transpose) ? blockDim.y : blockDim.x;
  const int by = (do_transpose) ? blockDim.x : blockDim.y;
  const int tidx = (do_transpose) ? tid / by : threadIdx.x;
  const int tidy = (do_transpose) ? tid % by : threadIdx.y;
  for (int m0 = blockIdx.y; m0 < Mnext; m0 += gridDim.y) {
    // This thread block handles the M range [Mstart, ..., Mend - 1]
    const index_t Mstart = (index_t)((int64)M*(int64)m0/(int64)Mnext);
    const index_t Mend = (index_t)((int64)M*(int64)(m0 + 1)/(int64)Mnext);
    for (index_t idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) {
      const index_t idx = idx0 + tidx;
      AType val = reduce(idx, tidx, tidy, N, Mstart, Mend, big, lhs, rhs, params);
      if (idx < N && (by == 1 || tidy == 0)) {
        small[idx + m0 * N] = val;
      }
    }
  }
}

)code";

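// Second pass of the multi-pass path: reduces the Mnext partial results produced by
// reduce_kernel_multi down to the final outputs.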
const char reduce_lines_kernel_code[] = R"code(
__launch_bounds__(kRTCMaxThreadsPerBlock)
__global__ void reduce_lines_kernel(const index_t N, const index_t M,
                                    const index_t small_in_stride,
                                    const AType* __restrict small_in,
                                    OutputType0 *small_out) {
  using OType = AccType<OutputType0>;
  for (index_t idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) {
    AType val, residual;
    REDUCER.SetInitValue(val, residual);
    for (int k = 0; k < M; k++) {
      REDUCER.Reduce(val,
                     small_in[idx + k*small_in_stride],
                     residual);
    }

    if (idx < N) {
      REDUCER.Finalize(val, residual);
      if (req == OpReqType::kAddTo) {
        small_out[idx] = OType::to(op::add(OType::from(small_out[idx]),
                                           static_cast<typename OType::type>(val)));
      } else {
        small_out[idx] = OType::to(val);
      }
    }
  }
}
)code";

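// Builds the generated source (caller-provided common_code plus the type aliases and
// flags assembled here), fills the kernel parameter struct, and compiles and launches
// the reduction kernel for the general case M > 1. When config.Mnext > 1, the first
// kernel writes partial results into the workspace and reduce_lines_kernel performs
// the final combination.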
void RTCReduceImpl(Stream<gpu>* s,
                   const TBlob& small,
                   const bool addto,
                   const TBlob& big,
                   const Tensor<gpu, 1, char>& workspace,
                   const ReduceImplConfig& config,
                   const int ndim,
                   const std::string& common_code,
                   int dev_id,
                   const TBlob* lhs = nullptr,
                   const TBlob* rhs = nullptr,
                   const bool use_index = false) {
  using namespace common::cuda::rtc;
  void* small_dptr = small.dptr_;
  if (config.Mnext > 1) {
    // small_dptr[] is N*Mnext*sizeof(DType) bytes
    small_dptr = workspace.dptr_;
    // Check that the workspace is contiguous
    CHECK_EQ(workspace.CheckContiguous(), true);
    // Check that we have enough storage
    CHECK_GE(workspace.size(0), config.workspace_size);
  }

  const int by =
      (config.kernel_1.do_transpose) ? config.kernel_1.blockDim.x : config.kernel_1.blockDim.y;
  const bool do_unroll = (config.M / (by * config.Mnext) >= unroll_reduce);
  std::string code =
      common_code + "#define UNROLL " + (do_unroll ? std::to_string(unroll_reduce) : "1") +
      "\n"
      "const bool do_transpose = " +
      (config.kernel_1.do_transpose ? "true" : "false") +
      ";\n"
      "using InputType0 = " +
      common::mshadow_type_info(big.type_flag_).name +
      ";\n"
      "using OutputType0 = " +
      common::mshadow_type_info(small.type_flag_).name +
      ";\n"
      "using InputType1 = " +
      ((lhs != nullptr) ? common::mshadow_type_info(lhs->type_flag_).name : "float32") +
      ";\n"
      "using InputType2 = " +
      ((rhs != nullptr) ? common::mshadow_type_info(rhs->type_flag_).name : "float32") + ";\n";
  if (lhs != nullptr) {
    code += "const bool use_input = true;";
  } else {
    code += "const bool use_input = false;";
  }

  reduce_kernel_params param{};
  for (int i = 0; i < ndim; ++i) {
    param.big_shape[i] = big.shape_[i];
    param.small_shape[i] = small.shape_[i];
    param.rshape[i] = config.rshape[i];
    param.rstride[i] = config.rstride[i];
    if (lhs != nullptr) {
      param.lhs_shape0[i] = lhs->shape_[i];
      param.rhs_shape0[i] = rhs->shape_[i];
      param.lhs_shape[i] = config.lhs_shape[i];
      param.rhs_shape[i] = config.rhs_shape[i];
      param.lhs_stride[i] = config.lhs_stride[i];
      param.rhs_stride[i] = config.rhs_stride[i];
    }
  }

  void* null_ptr = nullptr;
  std::vector<const void*> args;
  args.emplace_back(&config.N);
  args.emplace_back(&config.M);
  args.emplace_back(&big.dptr_);
  if (lhs != nullptr) {
    args.emplace_back(&(lhs->dptr_));
    args.emplace_back(&(rhs->dptr_));
  } else {
    args.emplace_back(&(null_ptr));
    args.emplace_back(&(null_ptr));
  }
  args.emplace_back(&small_dptr);
  args.emplace_back(&param);
  args.emplace_back(&config.Mnext);

  const auto& function_code = (lhs == nullptr) ?
                              (use_index ? reduce_function_index_code : reduce_function_code) :
                              reduce_function_use_input_code;
  const auto& kernel_name = (config.Mnext > 1) ? "reduce_kernel_multi" : "reduce_kernel_single";
  auto reduce_kernel_func =
      get_function(code + function_code, kernel_name, reduce_kernel_code, dev_id);
  launch(reduce_kernel_func,
         config.kernel_1.gridDim,
         config.kernel_1.blockDim,
         config.kernel_1.shMemSize,
         s,
         &args);

  if (config.Mnext > 1) {
    args.resize(0);
    args.emplace_back(&config.N);
    args.emplace_back(&config.Mnext);
    args.emplace_back(&config.N);
    args.emplace_back(&small_dptr);
    args.emplace_back(&small.dptr_);

    auto reduce_lines_kernel_func =
        get_function(code + function_code, "reduce_lines_kernel", reduce_lines_kernel_code, dev_id);
    launch(
        reduce_lines_kernel_func, config.kernel_2.gridSize, config.kernel_2.blockSize, 0, s, &args);
  }
}

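// Host-side mirror of the parameter struct for the M == 1 kernel below; only shapes are
// needed, since there is no reduction range to iterate over.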
struct reduce_kernel_M1_params {
  index_t big_shape[MAX_DIM];
  index_t lhs_shape[MAX_DIM];
  index_t rhs_shape[MAX_DIM];
  index_t small_shape[MAX_DIM];
};

const char reduce_kernel_M1_code[] = R"code(
struct reduce_kernel_M1_params {
  index_t big_shape[util::MAX_DIM];
  index_t lhs_shape[util::MAX_DIM];
  index_t rhs_shape[util::MAX_DIM];
  index_t small_shape[util::MAX_DIM];
};

__launch_bounds__(kRTCMaxThreadsPerBlock)
__global__ void reduce_kernel_M1(const int N,
                                 const InputType0* __restrict big,
                                 const InputType1* __restrict lhs,
                                 const InputType2* __restrict rhs,
                                 OutputType0 *small,
                                 const reduce_kernel_M1_params params) {
  using IType0 = AccType<InputType0>;
  using IType1 = AccType<InputType1>;
  using IType2 = AccType<InputType2>;
  using OType = AccType<OutputType0>;
  for (index_t index = threadIdx.x + blockIdx.x*blockDim.x;
       index < N;
       index += blockDim.x*gridDim.x) {
    index_t coord[ndim];
    util::unravel(index, params.small_shape, coord);
    index_t idx_big[1];
    idx_big[0] = util::ravel(coord, params.big_shape);
    index_t idx_lhs[1], idx_rhs[1];
    if (use_input) {
      idx_lhs[0] = util::ravel(coord, params.lhs_shape);
      idx_rhs[0] = util::ravel(coord, params.rhs_shape);
    }
    AType val, residual;
    REDUCER.SetInitValue(val, residual);
    const int u = 0;
    REDUCER.Reduce(val, FUNC, residual);
    REDUCER.Finalize(val, residual);
    if (req == OpReqType::kAddTo) {
      const auto temp = op::add(static_cast<typename OType::type>(val),
                                OType::from(small[index]));
      small[index] = OType::to(temp);
    } else {
      small[index] = OType::to(static_cast<typename OType::type>(val));
    }
  }
}
)code";

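// Specialization for config.M == 1: every output element depends on exactly one input
// element, so no workspace, unrolling, or second pass is needed.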
void RTCReduceM1Impl(Stream<gpu>* s,
                     const TBlob& small,
                     const TBlob& big,
                     const TBlob* lhs,
                     const TBlob* rhs,
                     const ReduceImplConfig& config,
                     const int ndim,
                     const std::string& common_code,
                     int dev_id,
                     const bool use_index = false) {
  using namespace common::cuda::rtc;

  std::string code =
      common_code + "using InputType0 = " + common::mshadow_type_info(big.type_flag_).name +
      ";\n"
      "using InputType1 = " +
      ((lhs != nullptr) ? common::mshadow_type_info(lhs->type_flag_).name : "float32") +
      ";\n"
      "using InputType2 = " +
      ((rhs != nullptr) ? common::mshadow_type_info(rhs->type_flag_).name : "float32") +
      ";\n"
      "using OutputType0 = " +
      common::mshadow_type_info(small.type_flag_).name + ";\n";
  if (lhs != nullptr) {
    code += "const bool use_input = true;";
  } else {
    code += "const bool use_input = false;";
  }

  reduce_kernel_M1_params param{};
  for (int i = 0; i < ndim; ++i) {
    param.big_shape[i] = big.shape_[i];
    param.small_shape[i] = small.shape_[i];
    if (lhs != nullptr) {
      param.lhs_shape[i] = lhs->shape_[i];
      param.rhs_shape[i] = rhs->shape_[i];
    }
  }

  void* null_ptr = nullptr;
  std::vector<const void*> args;
  args.emplace_back(&config.N);
  args.emplace_back(&big.dptr_);
  if (lhs != nullptr) {
    args.emplace_back(&(lhs->dptr_));
    args.emplace_back(&(rhs->dptr_));
  } else {
    args.emplace_back(&(null_ptr));
    args.emplace_back(&(null_ptr));
  }
  args.emplace_back(&small.dptr_);
  args.emplace_back(&param);

  const auto& function_code = (lhs == nullptr) ?
                              (use_index ? reduce_function_index_code : reduce_function_code) :
                              reduce_function_use_input_code;
  auto reduce_kernel_M1_func =
      get_function(code + function_code, "reduce_kernel_M1", reduce_kernel_M1_code, dev_id);
  launch(reduce_kernel_M1_func, config.kernel_1.gridDim, config.kernel_1.blockDim, 0, s, &args);
}

}  // namespace

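// Public entry point: reduces `big` into `small` with the runtime-compiled kernel.
// `reducer` and `OP` name a reducer and a unary op available to the RTC common code
// (OP is resolved in the op:: namespace); they are spliced into the generated source
// as REDUCER and OP. As an illustration only (the exact reducer/op strings depend on
// what the RTC headers define), a call could look like:
//   broadcast::RTCReduce(ctx, out, req[0], workspace, in, "red::sum{}", in.ndim(), "identity");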
void RTCReduce(const OpContext& ctx,
               const TBlob& small,
               const OpReqType req,
               const Tensor<gpu, 1, char>& workspace,
               const TBlob& big,
               const std::string& reducer,
               int ndim,
               const std::string& OP,
               const bool use_index) {
  using namespace mxnet::common::cuda::rtc;
  if (req == kNullOp)
    return;
  Stream<gpu>* s = ctx.get_stream<gpu>();
  ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr);
  std::string common_code = std::string("const OpReqType req = ") + util::to_string(req) +
                            ";\n"
                            "#define OP op::" +
                            OP +
                            "\n"
                            "#define REDUCER " +
                            reducer +
                            "\n"
                            "const int ndim = " +
                            std::to_string(ndim) + ";\n";
  if (config.M == 1) {
    RTCReduceM1Impl(s,
                    small,
                    big,
                    nullptr,
                    nullptr,
                    config,
                    ndim,
                    common_code,
                    ctx.run_ctx.ctx.dev_id,
                    use_index);
  } else {
    RTCReduceImpl(s,
                  small,
                  req == kAddTo,
                  big,
                  workspace,
                  config,
                  ndim,
                  common_code,
                  ctx.run_ctx.ctx.dev_id,
                  nullptr,
                  nullptr,
                  use_index);
  }
}

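// Three-input variant: each element of `big` is first combined with the (broadcast)
// `lhs` and `rhs` tensors, so the value being reduced is OP1(big, OP2(lhs, rhs))
// (e.g. for the backward pass of broadcast binary operators).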
void RTCReduce(const OpContext& ctx,
               const TBlob& small,
               const OpReqType req,
               const Tensor<gpu, 1, char>& workspace,
               const TBlob& big,
               const TBlob& lhs,
               const TBlob& rhs,
               const std::string& reducer,
               int ndim,
               const std::string& OP1,
               const std::string& OP2) {
  using namespace mxnet::common::cuda::rtc;
  if (req == kNullOp)
    return;
  Stream<gpu>* s = ctx.get_stream<gpu>();
  ReduceImplConfig config(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_);
  std::string common_code = std::string("const OpReqType req = ") + util::to_string(req) +
                            ";\n"
                            "#define OP1 op::" +
                            OP1 +
                            "\n"
                            "#define OP2 op::" +
                            OP2 +
                            "\n"
                            "#define REDUCER " +
                            reducer +
                            "\n"
                            "const int ndim = " +
                            std::to_string(ndim) + ";\n";
  if (config.M == 1) {
    RTCReduceM1Impl(s, small, big, &lhs, &rhs, config, ndim, common_code, ctx.run_ctx.ctx.dev_id);
  } else {
    RTCReduceImpl(s,
                  small,
                  req == kAddTo,
                  big,
                  workspace,
                  config,
                  ndim,
                  common_code,
                  ctx.run_ctx.ctx.dev_id,
                  &lhs,
                  &rhs);
  }
}

#endif  // MXNET_USE_CUDA

}  // namespace broadcast
}  // namespace op
}  // namespace mxnet