/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2017 by Contributors
* \file mxnet_op.h
* \brief Common operator utilities: type/req/ndim dispatch macros, shape indexing helpers and Kernel launchers
* \author Junyuan Xie
*/
#ifndef MXNET_OPERATOR_MXNET_OP_H_
#define MXNET_OPERATOR_MXNET_OP_H_
#include <dmlc/omp.h>
#include <mxnet/base.h>
#include <mxnet/engine.h>
#include <mxnet/op_attr_types.h>
#include <algorithm>
#include "./operator_tune.h"
#include "../engine/openmp.h"
#ifdef __CUDACC__
#include "../common/cuda_utils.h"
#endif // __CUDACC__
namespace mxnet {
namespace op {
namespace mxnet_op {
using namespace mshadow;
#ifdef __CUDA_ARCH__
__constant__ const float PI = 3.14159265358979323846;
#else
const float PI = 3.14159265358979323846;
using std::isnan;
#endif
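/*! \brief Number of threads used to process N elements on device xpu
 * (recommended OMP thread count on CPU, total CUDA threads covering N on GPU)
 */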
template<typename xpu>
int get_num_threads(const int N);
#ifdef __CUDACC__
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
inline cudaDeviceProp cuda_get_device_prop() {
int device;
CUDA_CALL(cudaGetDevice(&device));
cudaDeviceProp deviceProp;
CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device));
return deviceProp;
}
/*!
* \brief Get the number of blocks for cuda kernel given N
*/
inline int cuda_get_num_blocks(const int N) {
using namespace mshadow::cuda;
return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
}
template<>
inline int get_num_threads<gpu>(const int N) {
using namespace mshadow::cuda;
return kBaseThreadNum * cuda_get_num_blocks(N);
}
#endif // __CUDACC__
template<>
inline int get_num_threads<cpu>(const int N) {
return engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
}
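// Illustrative sketch (hypothetical TBlob `data`): query the thread count for the
// current device type, e.g. before sizing per-thread workspace:
//   const int nthreads = get_num_threads<xpu>(static_cast<int>(data.Size()));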
/*! \brief operator request type switch */
#define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \
switch (req) { \
case kNullOp: \
break; \
case kWriteInplace: \
case kWriteTo: \
{ \
const OpReqType ReqType = kWriteTo; \
{__VA_ARGS__} \
} \
break; \
case kAddTo: \
{ \
const OpReqType ReqType = kAddTo; \
{__VA_ARGS__} \
} \
break; \
default: \
break; \
}
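// Illustrative sketch (assumes `req` comes from the operator's OpReqType vector):
// ReqType becomes a compile-time constant, so it can be used as a template argument:
//   MXNET_ASSIGN_REQ_SWITCH(req, ReqType, {
//     // e.g. instantiate op_with_req<OP, ReqType> here
//   });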
/*! \brief operator request type switch */
#define MXNET_REQ_TYPE_SWITCH(req, ReqType, ...) \
switch (req) { \
case kNullOp: \
{ \
const OpReqType ReqType = kNullOp; \
{__VA_ARGS__} \
} \
break; \
case kWriteInplace: \
case kWriteTo: \
{ \
const OpReqType ReqType = kWriteTo; \
{__VA_ARGS__} \
} \
break; \
case kAddTo: \
{ \
const OpReqType ReqType = kAddTo; \
{__VA_ARGS__} \
} \
break; \
default: \
break; \
}
#define MXNET_NDIM_SWITCH(NDim, ndim, ...) \
if (NDim == 0) { \
} else if (NDim == 1) { \
const int ndim = 1; \
{__VA_ARGS__} \
} else if (NDim == 2) { \
const int ndim = 2; \
{__VA_ARGS__} \
} else if (NDim == 3) { \
const int ndim = 3; \
{__VA_ARGS__} \
} else if (NDim == 4) { \
const int ndim = 4; \
{__VA_ARGS__} \
} else if (NDim == 5) { \
const int ndim = 5; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "ndim=" << NDim << "too large "; \
}
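// Illustrative sketch (assumes `shape` is a TShape): dispatch a runtime ndim to the
// compile-time constant expected by the Shape<ndim> helpers below:
//   MXNET_NDIM_SWITCH(shape.ndim(), NDim, {
//     Shape<NDim> stride = calc_stride(shape.get<NDim>());
//     // ... use stride with ravel/unravel/dot ...
//   });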
#define MXNET_NO_INT8_TYPE_SWITCH(type, DType, ...) \
switch (type) { \
case mshadow::kFloat32: \
{ \
typedef float DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat64: \
{ \
typedef double DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat16: \
{ \
typedef mshadow::half::half_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kUint8: \
LOG(FATAL) << "This operation does not " \
"support int8 or uint8"; \
break; \
case mshadow::kInt8: \
LOG(FATAL) << "This operation does not " \
"support int8 or uint8"; \
break; \
case mshadow::kInt32: \
{ \
typedef int32_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt64: \
{ \
typedef int64_t DType; \
{__VA_ARGS__} \
} \
break; \
default: \
LOG(FATAL) << "Unknown type enum " << type; \
}
#define MXNET_NO_FLOAT16_TYPE_SWITCH(type, DType, ...) \
switch (type) { \
case mshadow::kFloat32: \
{ \
typedef float DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat64: \
{ \
typedef double DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat16: \
LOG(FATAL) << "This operation does not " \
"support float16"; \
break; \
case mshadow::kUint8: \
{ \
typedef uint8_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt8: \
{ \
typedef int8_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt32: \
{ \
typedef int32_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt64: \
{ \
typedef int64_t DType; \
{__VA_ARGS__} \
} \
break; \
default: \
LOG(FATAL) << "Unknown type enum " << type; \
}
#define MXNET_REAL_ACC_TYPE_SWITCH(type, DType, AType, ...)\
switch (type) { \
case mshadow::kFloat32: \
{ \
typedef float DType; \
typedef double AType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat64: \
{ \
typedef double DType; \
typedef double AType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat16: \
{ \
typedef mshadow::half::half_t DType; \
typedef float AType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kUint8: \
LOG(FATAL) << "This operation only support " \
"floating point types not uint8"; \
break; \
case mshadow::kInt8: \
LOG(FATAL) << "This operation only support " \
"floating point types not int8"; \
break; \
case mshadow::kInt32: \
LOG(FATAL) << "This operation only support " \
"floating point types, not int32"; \
break; \
case mshadow::kInt64: \
LOG(FATAL) << "This operation only support " \
"floating point types, not int64"; \
break; \
default: \
LOG(FATAL) << "Unknown type enum " << type; \
}
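// Illustrative sketch (hypothetical `inputs`): select the element type DType and a
// wider accumulation type AType from a runtime type flag:
//   MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, DType, AType, {
//     AType acc = 0;  // accumulate in AType, write results back as DType
//   });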
/*!
* \brief Assign val to out according to the request type, typically inside Kernel<OP>::Launch
* \param out the data to be assigned
* \param req the assignment request
* \param val the value to be assigned to out
*/
#define KERNEL_ASSIGN(out, req, val) \
{ \
switch (req) { \
case kNullOp: \
break; \
case kWriteTo: \
case kWriteInplace: \
(out) = (val); \
break; \
case kAddTo: \
(out) += (val); \
break; \
default: \
break; \
} \
}
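// Typical use inside a kernel's Map() body (sketch; `in_a` and `in_b` are hypothetical):
//   KERNEL_ASSIGN(out[i], req, in_a[i] * in_b[i]);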
#define MXNET_ADD_ALL_TYPES \
.add_enum("float32", mshadow::kFloat32) \
.add_enum("float64", mshadow::kFloat64) \
.add_enum("float16", mshadow::kFloat16) \
.add_enum("uint8", mshadow::kUint8) \
.add_enum("int8", mshadow::kInt8) \
.add_enum("int32", mshadow::kInt32) \
.add_enum("int64", mshadow::kInt64)
/* \brief Compute flattened index given coordinates and shape. */
template<int ndim>
MSHADOW_XINLINE index_t ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
index_t ret = 0;
#pragma unroll
for (int i = 0; i < ndim; ++i) {
ret = ret * shape[i] + (shape[i] > coord[i]) * coord[i];
}
return ret;
}
/* Compute coordinates from a flattened index, given the shape */
template<int ndim>
MSHADOW_XINLINE Shape<ndim> unravel(const index_t idx, const Shape<ndim>& shape) {
Shape<ndim> ret;
#pragma unroll
for (index_t i = ndim-1, j = idx; i >= 0; --i) {
auto tmp = j / shape[i];
ret[i] = j - tmp*shape[i];
j = tmp;
}
return ret;
}
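// Example: for shape (2, 3, 4), coord (1, 2, 3) ravels to (1*3 + 2)*4 + 3 = 23,
// and unravel(23, shape) recovers (1, 2, 3).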
/* Compute the dot product of two vectors */
template<int ndim>
MSHADOW_XINLINE index_t dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
index_t ret = 0;
#pragma unroll
for (int i = 0; i < ndim; ++i) {
ret += coord[i] * stride[i];
}
return ret;
}
/* Combines unravel and dot: maps a flattened index directly to a strided offset */
template<int ndim>
MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape<ndim>& shape,
const Shape<ndim>& stride) {
index_t ret = 0;
#pragma unroll
for (index_t i = ndim-1, j = idx; i >= 0; --i) {
auto tmp = j / shape[i];
ret += (j - tmp*shape[i])*stride[i];
j = tmp;
}
return ret;
}
/* Calculate the stride of each dim from the shape; size-1 dims get stride 0 (for broadcasting) */
template<int ndim>
MSHADOW_XINLINE Shape<ndim> calc_stride(const Shape<ndim>& shape) {
Shape<ndim> stride;
index_t cumprod = 1;
#pragma unroll
for (int i = ndim - 1; i >= 0; --i) {
stride[i] = (shape[i] > 1) ? cumprod : 0;
cumprod *= shape[i];
}
return stride;
}
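// Example: shape (2, 1, 4) yields stride (4, 0, 1); the zero stride on the size-1 axis
// means dot(coord, stride) ignores that coordinate, which implements broadcasting.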
/* Increment coordinates and modify index */
template<int ndim>
MSHADOW_XINLINE void inc(Shape<ndim>* coord, const Shape<ndim>& shape,
index_t* idx, const Shape<ndim>& stride) {
++(*coord)[ndim-1];
*idx += stride[ndim-1];
#pragma unroll
for (int i = ndim - 1; i > 0 && (*coord)[i] >= shape[i]; --i) {
(*coord)[i] -= shape[i];
++(*coord)[i-1];
*idx = *idx + stride[i-1] - shape[i] * stride[i];
}
}
/* Increment coordinates and modify two indices, one per stride */
template<int ndim>
MSHADOW_XINLINE void inc(Shape<ndim>* coord, const Shape<ndim>& shape,
index_t* idx1, const Shape<ndim>& stride1,
index_t* idx2, const Shape<ndim>& stride2) {
++(*coord)[ndim-1];
*idx1 += stride1[ndim-1];
*idx2 += stride2[ndim-1];
#pragma unroll
for (int i = ndim - 1; i > 0 && (*coord)[i] >= shape[i]; --i) {
(*coord)[i] -= shape[i];
++(*coord)[i-1];
*idx1 = *idx1 + stride1[i-1] - shape[i] * stride1[i];
*idx2 = *idx2 + stride2[i-1] - shape[i] * stride2[i];
}
}
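// Sketch of broadcast-style iteration using these helpers (hypothetical names; `istride`
// would come from calc_stride of an input shape padded with size-1 axes):
//   Shape<ndim> coord = unravel(0, oshape);
//   index_t j = dot(coord, istride);
//   for (index_t i = 0; i < N; ++i) {
//     out[i] = OP::Map(in[j]);
//     inc(&coord, oshape, &j, istride);
//   }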
/*!
* \brief Simple copy of data from one blob to another, casting the element type if needed
* \param s Stream to run the copy on
* \param to Destination blob
* \param from Source blob
*/
template <typename xpu>
MSHADOW_CINLINE void copy(mshadow::Stream<xpu> *s, const TBlob& to, const TBlob& from) {
CHECK_EQ(from.Size(), to.Size());
CHECK_EQ(from.dev_mask(), to.dev_mask());
MSHADOW_TYPE_SWITCH(to.type_flag_, DType, {
if (to.type_flag_ == from.type_flag_) {
mshadow::Copy(to.FlatTo1D<xpu, DType>(s), from.FlatTo1D<xpu, DType>(s), s);
} else {
MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
to.FlatTo1D<xpu, DType>(s) = mshadow::expr::tcast<DType>(from.FlatTo1D<xpu, SrcDType>(s));
})
}
})
}
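// Illustrative sketch (hypothetical blobs `dst` and `src` on the same device):
//   copy<xpu>(s, dst, src);  // element type is cast when dst and src types differ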
/*! \brief Binary op backward gradient OP wrapper */
template<typename GRAD_OP>
struct backward_grad {
/* \brief Backward gradient calculation: multiply the output gradient by GRAD_OP applied to args
* \param a - output grad
* \param args... - data passed to the grad calculation op (what this is -- input, output, etc. -- varies by op)
* \return input grad
*/
template<typename DType, typename ...Args>
MSHADOW_XINLINE static DType Map(DType a, Args... args) {
return DType(a * GRAD_OP::Map(args...));
}
};
/*! \brief Binary op backward gradient OP wrapper (tuned) */
template<typename GRAD_OP>
struct backward_grad_tuned : public backward_grad<GRAD_OP>, public tunable {
using backward_grad<GRAD_OP>::Map;
};
/*! \brief Select assignment operation based upon the req value
* Also useful for mapping mshadow Compute (F<OP>) to Kernel<OP>::Launch
*/
template<typename OP, int req>
struct op_with_req {
typedef OP Operation;
/*! \brief input is one tensor */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in) {
KERNEL_ASSIGN(out[i], req, OP::Map(in[i]));
}
/*! \brief inputs are two tensors */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *lhs, const DType *rhs) {
KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i]));
}
/*! \brief inputs are a tensor and a scalar value */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value) {
KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value));
}
/*! \brief inputs are a tensor and two scalar values */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in,
const DType value_1, const DType value_2) {
KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value_1, value_2));
}
/*! \brief No inputs (i.e. fill with a constant value) */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out) {
KERNEL_ASSIGN(out[i], req, OP::Map());
}
/*! \brief input is a single scalar value */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType value) {
KERNEL_ASSIGN(out[i], req, OP::Map(value));
}
/*! \brief inputs are two tensors and a scalar value */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out,
const DType *input_1, const DType *input_2, const DType value) {
KERNEL_ASSIGN(out[i], req, OP::Map(input_1[i], input_2[i], value));
}
/*! \brief inputs are three tensors (i.e. backward grad with a binary grad function) */
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out,
const DType *input_1,
const DType *input_2,
const DType *input_3) {
KERNEL_ASSIGN(out[i], req, OP::Map(input_1[i], input_2[i], input_3[i]));
}
};
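// Canonical unary-op sketch (assumes mshadow_op.h is also included; `s`, `req`, `size`,
// `out_ptr` and `in_ptr` are hypothetical):
//   MXNET_ASSIGN_REQ_SWITCH(req, Req, {
//     Kernel<op_with_req<mshadow_op::identity, Req>, xpu>::Launch(s, size, out_ptr, in_ptr);
//   });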
template<typename OP, typename xpu>
struct Kernel;
/*!
* \brief CPU Kernel launcher
* \tparam OP Operator to launch
*/
template<typename OP>
struct Kernel<OP, cpu> {
/*!
* \brief Launch a generic CPU kernel.
* When using this for a new kernel op, add declaration and tuning objects to
* operator_tune.cc
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param N Number of iterations
* \param args Varargs to eventually pass to the OP::Map() function
*/
template<typename ...Args>
inline static bool Launch(mshadow::Stream<cpu> *, const size_t N, Args... args) {
#ifdef _OPENMP
const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (omp_threads < 2) {
for (size_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
} else {
#pragma omp parallel for num_threads(omp_threads)
for (index_t i = 0; i < static_cast<index_t>(N); ++i) {
OP::Map(i, args...);
}
}
#else
for (size_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
#endif
return true;
}
/*!
* \brief Launch a generic CPU kernel with dynamic schedule. This is recommended
* for irregular workloads such as spmv.
* When using this for a new kernel op, add declaration and tuning objects to
* operator_tune.cc
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param N Number of iterations
* \param args Varargs to eventually pass to the OP::Map() function
*/
template<typename ...Args>
inline static bool LaunchDynamic(mshadow::Stream<cpu> *, const int64_t N, Args... args) {
#ifdef _OPENMP
const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(false);
if (omp_threads < 2) {
for (int64_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
} else {
#pragma omp parallel for num_threads(omp_threads) schedule(dynamic)
for (int64_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
}
#else
for (int64_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
#endif
return true;
}
/*!
* \brief Launch CPU kernel which has OMP tuning data available.
* When using this for a new kernel op, add declaration and tuning objects to
* operator_tune.cc
* \tparam PRIMITIVE_OP The primitive operation to use for tuning
* \tparam DType Data type
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param N Number of iterations
* \param dest Destination pointer (used to infer DType)
* \param args Varargs to eventually pass to the OP::Map() function
*/
template<typename PRIMITIVE_OP, typename DType, typename ...Args>
static void LaunchTuned(mshadow::Stream<cpu> *, const size_t N, Args... args) {
#ifdef _OPENMP
const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (omp_threads < 2 || !tuned_op<PRIMITIVE_OP, DType>::UseOMP(
N, static_cast<size_t>(omp_threads))) {
for (size_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
} else {
#pragma omp parallel for num_threads(omp_threads)
for (index_t i = 0; i < static_cast<index_t>(N); ++i) {
OP::Map(i, args...);
}
}
#else
for (size_t i = 0; i < N; ++i) {
OP::Map(i, args...);
}
#endif
}
/*!
* \brief Launch custom-tuned kernel where each thread is set to
* operate on a contiguous partition
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param N Number of iterations
* \param args Varargs to eventually pass to the OP::Map() function
*/
template<typename ...Args>
inline static void LaunchEx(mshadow::Stream<cpu> *s, const size_t N, Args... args) {
#ifdef _OPENMP
const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (omp_threads < 2) {
OP::Map(0, N, args...);
} else {
const auto length = (N + omp_threads - 1) / omp_threads;
#pragma omp parallel for num_threads(omp_threads)
for (index_t i = 0; i < static_cast<index_t>(N); i += length) {
OP::Map(i, i + length > N ? N - i : length, args...);
}
}
#else
OP::Map(0, N, args...);
#endif
}
/*!
* \brief Launch a tunable OP with implicitly-supplied data type
* \tparam DType Data type
* \tparam T OP type
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param s Stream (usually null for CPU)
* \param N Number of iterations
* \param dest Destination pointer (used to infer DType)
* \param args Varargs to eventually pass to the OP::Map() function
* \return Always true
*/
template<typename DType, typename T = OP, typename ...Args>
static MSHADOW_CINLINE
typename std::enable_if<std::is_base_of<tunable, T>::value, bool>::type
Launch(mshadow::Stream<cpu> *s, const size_t N, DType *dest, Args... args) {
LaunchTuned<T, DType>(s, N, dest, args...);
return true;
}
/*!
* \brief Launch a tunable OP wrapper with explicitly-supplied data type (ie op_with_req)
* \tparam DType Data type
* \tparam T Wrapper type
* \tparam Args Varargs type to eventually pass to the OP::Map() function
* \param s Stream (usually null for CPU)
* \param N Number of iterations
* \param dest Destination pointer (used to infer DType)
* \param args Varargs to eventually pass to the OP::Map() function
* \return Always true
*/
template<typename DType, typename T = OP, typename ...Args>
static MSHADOW_CINLINE
typename std::enable_if<std::is_base_of<tunable, typename T::Operation>::value, bool>::type
Launch(mshadow::Stream<cpu> *s, const size_t N, DType *dest, Args... args) {
LaunchTuned<typename T::Operation, DType>(s, N, dest, args...);
return true;
}
};
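// Minimal sketch of a custom kernel struct and its CPU launch (hypothetical `axpy_kernel`,
// `s`, `n`, `y`, `x`):
//   struct axpy_kernel {
//     template<typename DType>
//     MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType a) {
//       out[i] += a * in[i];
//     }
//   };
//   Kernel<axpy_kernel, cpu>::Launch(s, n, y, x, 2.0f);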
#ifdef __CUDACC__
template<typename OP, typename ...Args>
__global__ void mxnet_generic_kernel(int N, Args... args) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
OP::Map(i, args...);
}
}
template<typename OP, typename ...Args>
__global__ void mxnet_generic_kernel_ex(int N, Args... args) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
OP::Map(i, 1, args...);
}
}
template<typename OP>
struct Kernel<OP, gpu> {
/*! \brief Launch GPU kernel */
template<typename ...Args>
inline static void Launch(mshadow::Stream<gpu> *s, int N, Args... args) {
using namespace mshadow::cuda;
int ngrid = std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
mxnet_generic_kernel<OP, Args...>
<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
N, args...);
MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel);
}
template<typename ...Args>
inline static void LaunchEx(mshadow::Stream<gpu> *s, const int N, Args... args) {
using namespace mshadow::cuda;
int ngrid = std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
mxnet_generic_kernel_ex<OP, Args...>
<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
N, args...);
MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel_ex);
}
};
#endif // __CUDACC__
/*!
* \brief Set to immediate scalar value kernel
* \tparam val Scalar immediate
*/
template<int val>
struct set_to_int : public tunable {
// mxnet_op version (when used directly with Kernel<>::Launch())
template<typename DType>
MSHADOW_XINLINE static void Map(index_t i, DType *out) {
out[i] = DType(val);
}
// mshadow_op version (when used with op_with_req<>)
MSHADOW_XINLINE static int Map() {
return val;
}
};
/*!
* \brief Special-case kernel shortcut for setting to zero and one
*/
using set_zero = set_to_int<0>;
using set_one = set_to_int<1>;
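// Illustrative sketch (hypothetical stream `s` and float-typed TBlob `out`): zero-fill a buffer:
//   Kernel<set_zero, xpu>::Launch(s, out.Size(), out.dptr<float>());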
} // namespace mxnet_op
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_MXNET_OP_H_