/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
#define SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
#include "singa/singa_config.h"
#ifdef USE_CUDA
#include "singa/core/tensor.h"
#include "./tensor_math.h"
#include "./math_kernel.h"
#include "singa/utils/cuda_utils.h"
#include "singa/core/common.h"
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "singa/utils/cuda_utils.h"
#include <cudnn.h>
#define check_cudnn(expression) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
LOG(FATAL) << "Error on line " << __LINE__ << ": " \
<< cudnnGetErrorString(status) << " "; \
} \
}
namespace singa {
// ===================== Helper Functions =============================
/*
cuDNN imposes one requirement on tensor dimensions:
1.) Tensors of 4 or fewer dimensions must be padded to 4 dimensions.
A 5-d tensor is passed to cuDNN as-is; beyond 5-d, certain operations are not supported
(cudnnOpTensor supports up to 5-d, cudnnReduceTensor supports up to 8-d).
e.g. Tensor A with shape {3,3} must be described to cuDNN with shape {1,1,3,3};
     Tensor B with shape {2,3,4} must be described with shape {1,2,3,4}.
*/
vector<int> generate_shape_cuda(const Tensor& x) {
Shape shape = x.shape();
CHECK_LE(shape.size(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ;
vector<int> shape_arr;
if (shape.size() <= 4) {
for (int n = 0; n < 4 - shape.size(); ++n) {
shape_arr.push_back(1);
}
}
for (auto dim : shape)
shape_arr.push_back(static_cast<int>(dim));
return shape_arr;
}
int generate_dim_cuda(const Tensor& x) {
CHECK_LE(x.nDim(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ;
if (x.shape().size() <= 4) {return 4;}
else {return 5;}
}
/*
cuDNN requires the stride array to conform to the padded shape described above:
1.) Strides must likewise be padded to a minimum of 4 entries for tensors of 4 or fewer dimensions.
A 5-d tensor is passed to cuDNN as-is; beyond 5-d, certain operations are not supported
(cudnnOpTensor supports up to 5-d, cudnnReduceTensor supports up to 8-d).
e.g. Tensor A with shape {3,3} and strides {3,1} must be described to cuDNN with
shape {1,1,3,3} and strides {9,9,3,1} (or {9,9,1,3}).
*/
vector<int> generate_strides_cuda(const Tensor& x) {
Shape shape = x.shape();
auto& strides = x.strides();
vector<int> strides_arr;
int product = Product(shape);
if (shape.size() <= 4) {
for (int n = 0; n < 4 - shape.size(); ++n) {
strides_arr.push_back(product);
}
}
for (auto stride : strides)
strides_arr.push_back(static_cast<int>(stride));
return strides_arr;
}
cudnnTensorDescriptor_t generate_tensor_nd_desc(const Tensor& x) {
cudnnTensorDescriptor_t x_desc;
check_cudnn(cudnnCreateTensorDescriptor(&x_desc));
check_cudnn(cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
generate_dim_cuda(x),
generate_shape_cuda(x).data(),
generate_strides_cuda(x).data()
));
return x_desc;
}
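/*
Illustrative sketch (not part of the API) of how the helpers above combine:
for a Tensor x with shape {3,3} and strides {3,1}, generate_dim_cuda(x) returns 4,
generate_shape_cuda(x) returns {1,1,3,3}, and generate_strides_cuda(x) returns {9,9,3,1},
so a cuDNN descriptor for x can be built and released as follows:

  cudnnTensorDescriptor_t desc = generate_tensor_nd_desc(x);
  // ... use desc in a cuDNN call, wrapped in check_cudnn(...) ...
  cudnnDestroyTensorDescriptor(desc);
*/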
cudnnOpTensorDescriptor_t generate_op_desc(cudnnOpTensorOp_t op) {
cudnnOpTensorDescriptor_t op_desc;
check_cudnn(cudnnCreateOpTensorDescriptor(&op_desc));
check_cudnn(cudnnSetOpTensorDescriptor(op_desc, op,
CUDNN_DATA_FLOAT,
CUDNN_PROPAGATE_NAN
));
return op_desc;
}
// ===================== CUDA Functions =============================
/// out[i] = |in[i]|
template <>
void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = -1.0;
float beta = 0.0;
cudnnTensorDescriptor_t in_desc = generate_tensor_nd_desc(in);
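// |in[i]| is computed as max(alpha1 * in[i], alpha2 * in[i]) = max(in[i], -in[i])
// using CUDNN_OP_TENSOR_MAX with the same input tensor as both operands.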
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_MAX),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
cudnnDestroyTensorDescriptor(in_desc);
}
template <>
void Set<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
check_cudnn(cudnnSetTensor(ctx->cudnn_handle, generate_tensor_nd_desc(*out),
outPtr, (void*)(&x)));
}
template <>
void Add<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
Set<float, lang::Cuda>(x, out, ctx);
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = 1.0, beta = 1.0;
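// out already holds x (from Set above); with beta = 1, cudnnAddTensor accumulates:
// out = alpha * in + beta * out = in + x.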
check_cudnn(cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
}
/// out = in1 + in2
template <>
void Add<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, Tensor* out, Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = 1.0;
float beta = 0.0;
if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensor_nd_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensor_nd_desc(in2), inPtr2,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
} else {
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensor_nd_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensor_nd_desc(in1), inPtr2,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
}
}
/// out = in1 - in2
template <>
void Sub<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, Tensor* out, Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = -1.0;
float beta = 0.0;
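// subtraction is expressed as addition with alpha2 = -1:
// out = alpha1 * in1 + alpha2 * in2 = in1 - in2.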
if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensor_nd_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensor_nd_desc(in2), inPtr2,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
} else {
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensor_nd_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensor_nd_desc(in1), inPtr2,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
}
}
template <>
void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = 1.0;
float beta = 0.0;
check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
}
/// Element-wise operation, clamp every element into [low, high]
/// if x>high, then x=high; if x<low, then x=low.
template <>
void Clamp<float, lang::Cuda>(const float low,
const float high, const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
//if both in and out strides are the same, we proceed to normal cuda::clamp
if (in.strides() == out->strides()) {
cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::clamp(num, low, high, outPtr, outPtr, ctx->stream);
}
}
/// out = in1 / in2
template <>
void Div<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, Tensor* out, Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
//if both in1 and in2 are not transposed, and have the same strides,
//we proceed to normal cuda::div
if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
} else { //else we check whether in1 or in2 or both are transposed
if (in1.transpose() && in2.transpose()) {
Tensor t(in1.shape(), in1.device(), in1.data_type());
Transform<float, lang::Cuda>(in1, &t, ctx);
Transform<float, lang::Cuda>(in2, out, ctx);
float* tPtr = static_cast<float*>(t.block()->mutable_data());
cuda::div(num, tPtr, outPtr, outPtr, ctx->stream);
} else if (in1.transpose()) {
Transform<float, lang::Cuda>(in1, out, ctx);
cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
} else if (in2.transpose()) {
Transform<float, lang::Cuda>(in2, out, ctx);
cuda::div(num, inPtr1, outPtr, outPtr, ctx->stream);
}
}
}
template <>
void Div<float, lang::Cuda>(const float x, const Tensor& in,
Tensor* out, Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::div(num, x, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::div(num, x, outPtr, outPtr, ctx->stream);
}
}
/// out = in * x
template <>
void EltwiseMult<float, lang::Cuda>(const Tensor& in,
const float x, Tensor* out, Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = x, beta = 0.0;
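// with alpha = x and beta = 0, cudnnAddTensor yields out = x * in.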
check_cudnn(cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
}
/// out = in1 * in2
template <>
void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, Tensor* out,
Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
//if both in1 and in2 are not transposed, and have the same strides,
//we proceed to normal cuda::mult
if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
} else { //else we check whether in1 or in2 or both are transposed
if (in1.transpose() && in2.transpose()) {
Tensor t(in1.shape(), in1.device(), in1.data_type());
Transform<float, lang::Cuda>(in1, &t, ctx);
Transform<float, lang::Cuda>(in2, out, ctx);
float* tPtr = static_cast<float*>(t.block()->mutable_data());
cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream);
} else if (in1.transpose()) {
Transform<float, lang::Cuda>(in1, out, ctx);
cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
} else if (in2.transpose()) {
Transform<float, lang::Cuda>(in2, out, ctx);
cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream);
}
}
}
/// Base is e. out[i]=e^in[i]
template <>
void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::exp(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::exp(num, outPtr, outPtr, ctx->stream);
}
}
template <>
void GE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::ge(num, inPtr, x, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::ge(num, outPtr, x, outPtr, ctx->stream);
}
}
template <>
void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
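// in1 >= in2 is evaluated as (in1 - in2) >= 0: the difference computed above is
// stored in out and compared against 0 below. GT, LE and LT follow the same pattern.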
cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
void GT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::gt(num, inPtr, x, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::gt(num, outPtr, x, outPtr, ctx->stream);
}
}
template <>
void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
void LE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::le(num, inPtr, x, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::le(num, outPtr, x, outPtr, ctx->stream);
}
}
template <>
void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Natural logarithm (base e, Napier's constant): out[i] = ln(in[i]).
template <>
void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::log(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::log(num, outPtr, outPtr, ctx->stream);
}
}
template <>
void LT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::lt(num, inPtr, x, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::lt(num, outPtr, x, outPtr, ctx->stream);
}
}
template <>
void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Element-wise operation, out[i] = in[i]^x
template <>
void Pow<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::pow(num, inPtr, x, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::pow(num, outPtr, x, outPtr, ctx->stream);
}
}
/// Element-wise operation, out[i] = in1[i]^in2[i]
template <>
void Pow<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, Tensor* out, Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
//if both in1 and in2 are not transposed, and have the same strides,
//we proceed to normal cuda::pow
if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
} else { //else we check whether in1 or in2 or both are transposed
if (in1.transpose() && in2.transpose()) {
Tensor t(in1.shape(), in1.device(), in1.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
Transform<float, lang::Cuda>(in1, &t, ctx);
Transform<float, lang::Cuda>(in2, out, ctx);
cuda::pow(num, tPtr, outPtr, outPtr, ctx->stream);
} else if (in1.transpose()) {
Transform<float, lang::Cuda>(in1, out, ctx);
cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
} else if (in2.transpose()) {
Transform<float, lang::Cuda>(in2, out, ctx);
cuda::pow(num, inPtr1, outPtr, outPtr, ctx->stream);
}
}
}
/// Element-wise operation, out[i]=max(0, in[i])
// template <>
// void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
// cudnnActivationMode_t mode = CUDNN_ACTIVATION_RELU;
// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
// cudnnDestroyTensorDescriptor(out_desc);
// cudnnDestroyActivationDescriptor(act_desc);
// }
template <>
void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::relu(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::relu(num, outPtr, outPtr, ctx->stream);
}
}
// /// Element-wise operation, out[i]=sigmoid(in[i])
// template <>
// void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
// cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
// cudnnDestroyTensorDescriptor(out_desc);
// cudnnDestroyActivationDescriptor(act_desc);
// }
/// Element-wise operation, out[i]=sigmoid(in[i])
template <>
void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::sigmoid(num, outPtr, outPtr, ctx->stream);
}
}
// out[i] = sign(in[i])
template <>
void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::sign(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::sign(num, outPtr, outPtr, ctx->stream);
}
}
// Element-wise operation, out[i]=sqrt(in[i])
template <>
void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
#if CUDNN_MAJOR < 7
Transform<float, lang::Cuda>(in, out, ctx);
size_t num = in.Size();
cuda::sqrt(num, outPtr, outPtr, ctx->stream);
#else
const float* inPtr = static_cast<const float*>(in.block()->data());
float alpha1 = 1.0;
float alpha2 = 0.0;
float beta = 0.0;
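// CUDNN_OP_TENSOR_SQRT applies sqrt to alpha1 * A only; the second operand is passed
// to satisfy the API but, with alpha2 = 0, it is effectively unused here.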
cudnnTensorDescriptor_t in_desc = generate_tensor_nd_desc(in);
check_cudnn(cudnnOpTensor(ctx->cudnn_handle, generate_op_desc(CUDNN_OP_TENSOR_SQRT),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
(void*)(&beta), generate_tensor_nd_desc(*out), outPtr
));
#endif // CUDNN_MAJOR < 7
}
/// Element-wise operation, out[i]=in[i]^2
template <>
void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::square(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::square(num, outPtr, outPtr, ctx->stream);
}
}
// template <>
// void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
// Context* ctx) {
// LOG(FATAL) << "Cuda Sum is not implemented!";
// // const float* inPtr = static_cast<const float*>(in.data());
// // cuda::sum(num, inPtr, out, ctx->stream);
// }
/// Element-wise operation, out[i]=tanh(in[i])
// template <>
// void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
// cudnnActivationMode_t mode = CUDNN_ACTIVATION_TANH;
// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
// cudnnDestroyTensorDescriptor(out_desc);
// cudnnDestroyActivationDescriptor(act_desc);
// }
template <>
void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
if (in.strides() == out->strides()) {
cuda::tanh(num, inPtr, outPtr, ctx->stream);
} else { //else we transform in to out to store first
Transform<float, lang::Cuda>(in, out, ctx);
cuda::tanh(num, outPtr, outPtr, ctx->stream);
}
}
// ================Random functions===========================================
/// Each element of out is 1 with probability p and 0 with probability 1-p, where 0 <= p <= 1.
// Get the random generator from 'ctx'
// If DType is not float, then convert the threshold to DType
template <>
void Bernoulli<float, lang::Cuda>(const float p, Tensor* out,
Context* ctx) {
auto rgen = ctx->curand_generator;
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = out->Size();
CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
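// the uniform draws in (0,1] are thresholded against p, so each element becomes 1
// with probability p and 0 otherwise.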
cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
}
// The random generator should be extracted from ctx.
// If DType is not float, then convert the low and high to DType
template <>
void Uniform<float, lang::Cuda>(const float low,
const float high, Tensor* out, Context* ctx) {
auto rgen = ctx->curand_generator;
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = out->Size();
CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
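// rescale the uniform draws from (0,1] to the target range: out = u * (high - low) + low.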
cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
cuda::add(num, outPtr, low, outPtr, ctx->stream);
}
// The random generator should be extracted from ctx.
// If DType is not float, then convert the mean and std to DType
template <>
void Gaussian<float, lang::Cuda>(const float mean,
const float std, Tensor* out, Context* ctx) {
auto rgen = ctx->curand_generator;
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = out->Size();
CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
}
// =========================Blas operations==================================
// ref to http://docs.nvidia.com/cuda/cublas
template <>
void Amax<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
*out = idx - 1; // cublas index starts from 1
}
/// return the index of the element with the min value.
template <>
void Amin<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
*out = idx - 1;
}
/// out = sum |x| for all x in in
template <>
void Asum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const size_t num = in.Size();
CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
}
/// out = alpha * in + out
template <>
void Axpy<float, lang::Cuda>(const float alpha,
const Tensor& in, Tensor* out, Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const size_t num = in.Size();
CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
}
/// out = \sum_i in1[i] * in2[i]
template <>
void Dot<float, lang::Cuda>(const Tensor& in1,
const Tensor& in2, float* out, Context* ctx) {
const float* inPtr1 = static_cast<const float*>(in1.block()->data());
const float* inPtr2 = static_cast<const float*>(in2.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const size_t num = in1.Size();
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const float* inPtr = static_cast<const float*>(in.block()->data());
const size_t num = in.Size();
CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
}
template <>
void Scale<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = out->Size();
CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
}
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
template <>
void DGMM<float, lang::Cuda>(const bool side_right, const Tensor& M, const Tensor& v,
Tensor* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const float* MPtr = static_cast<const float*>(M.block()->data());
const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t nrow = M.shape(0);
const size_t ncol = M.shape(1);
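// cuBLAS treats the row-major M as its transpose, so the row/column counts and the
// diagonal-multiply side are swapped relative to the row-major view.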
if (side_right) {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
} else {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_RIGHT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
}
}
template <>
void GEMV<float, lang::Cuda>(const float alpha, const Tensor& A, const Tensor& v,
const float beta, Tensor* out, Context* ctx) {
const float* APtr = static_cast<const float*>(A.block()->data());
const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t m = A.shape()[0];
const size_t n = A.shape()[1];
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
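// cuBLAS is column-major, so the row-major A appears to cuBLAS as A^T; the op flag
// and the m/n arguments are swapped accordingly.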
if (!(A.transpose()))
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
1, &beta, outPtr, 1));
else
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
1, &beta, outPtr, 1));
}
// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
template <>
void GEMM<float, lang::Cuda>(const float alpha,
const Tensor& A, const Tensor& B, const float beta,
Tensor* C, Context* ctx) {
auto transA = A.transpose();
auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transB = B.transpose();
auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
const size_t nrowA = A.shape()[0];
const size_t ncolA = A.shape()[1];
const size_t ncolB = B.shape()[1];
int lda = transA ? nrowA : ncolA;
int ldb = transB ? ncolA : ncolB;
int ldc = ncolB;
const float* APtr = static_cast<const float*>(A.block()->data());
const float* BPtr = static_cast<const float*>(B.block()->data());
float* CPtr = static_cast<float*>(C->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
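// row-major C = A * B is computed as column-major C^T = B^T * A^T, hence the swapped
// operand order in the call below.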
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
}
template <>
void ComputeCrossEntropy<float, lang::Cuda>(bool int_target,
const size_t batchsize,
const size_t dim, const Block* p,
const Block* t, Block* loss,
Context* ctx) {
const float* pPtr = static_cast<const float*>(p->data());
const int* tPtr = static_cast<const int*>(t->data());
float* lossPtr = static_cast<float*>(loss->mutable_data());
cuda::ComputeCrossEntropy(int_target, batchsize, dim, pPtr, tPtr, lossPtr,
ctx->stream);
}
template <>
void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
const size_t batchsize,
const size_t dim, const Block* p,
const Block* t, Block* grad,
Context* ctx) {
CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
const float* pPtr = static_cast<const float*>(p->data());
const int* tPtr = static_cast<const int*>(t->data());
float* gradPtr = static_cast<float*>(grad->mutable_data());
cuda::SoftmaxCrossEntropyBwd(int_target, batchsize, dim, pPtr, tPtr, gradPtr,
ctx->stream);
}
// template <>
// void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// // const size_t nrow = in.shape()[0];
// // const size_t ncol = in.shape()[1];
// // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
// //vector<int> reduce_row_axes_shape = in.generate_shape_cuda();
// //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
// vector<int> reduce_row_axes_shape = {1,1,1,1};
// vector<int> reduced_strides = {1,1,1,1};
// //reduce_desc
// cudnnReduceTensorDescriptor_t reduce_desc;
// cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
// cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
// //cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_FLATTENED_INDICES;
// cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
// cudnnCreateReduceTensorDescriptor(&reduce_desc);
// cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
// cudnn_propagation, cudnn_indices, cudnn_indices_type);
// //instantiate new tensor to use new blocks as memory instead of cudaMalloc
// //create 2 tensors of same size as input tensor
// Shape reduction_size = {1000};
// Tensor indices(reduction_size, in.device(), in.data_type());
// Tensor workspace(reduction_size, in.device(), in.data_type());
// size_t indices_bytes = indices.block()->size()*1000;
// size_t workspace_bytes = workspace.block()->size()*1000;
// size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
// float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
// //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
// //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// reduce_row_axes_shape.data(), reduced_strides.data());
// cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
// indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
// (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
// cudnnDestroyTensorDescriptor(out_desc);
// }
template <>
void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t nrow = in.shape()[0];
const size_t ncol = in.shape()[1];
if (in.transpose()) {
Tensor t(in.shape(), in.device(), in.data_type());
Transform<float, lang::Cuda>(in, &t, ctx);
const float* tPtr_const = static_cast<const float*>(t.block()->data());
cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
} else {
cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
}
}
// NOTE: this specialization must appear after the Set and Dot specializations;
// otherwise they would be implicitly instantiated before being specialized.
template <>
void Sum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
#if CUDNN_MAJOR < 7
Tensor one(in.shape(), in.device(), in.data_type());
Set<float, lang::Cuda>(float(1), &one, ctx);
Dot<float, lang::Cuda>(in, one, out, ctx);
#else
const float* inPtr = static_cast<const float*>(in.block()->data());
//reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
Shape reduced_shape = {1};
Tensor t(reduced_shape, in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
vector<int> reduce_all_axes = generate_shape_cuda(in);
for (size_t n = 0; n < reduce_all_axes.size(); ++n) {
reduce_all_axes[n] = 1;
}
//reduce_desc
cudnnReduceTensorDescriptor_t reduce_desc;
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
check_cudnn(cudnnCreateReduceTensorDescriptor(&reduce_desc));
check_cudnn(cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
cudnn_propagation, cudnn_indices, cudnn_indices_type));
//instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
size_t reduction_size_int = Product(in.shape());
Shape reduction_size = {reduction_size_int * 100};
Tensor indices(reduction_size, in.device(), in.data_type());
Tensor workspace(reduction_size, in.device(), in.data_type());
size_t indices_bytes = indices.block()->size() * 100;
size_t workspace_bytes = workspace.block()->size() * 100;
size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
//void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
//cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
float alpha = 1.0;
float beta = 0.0;
check_cudnn(cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
(void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
(void*)(&beta), generate_tensor_nd_desc(t), tPtr
));
*out = tPtr[0];
#endif // CUDNN_MAJOR < 7
}
} // namespace singa
#endif // USE_CUDA
#endif // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_