/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "singa/core/tensor.h"
#include <algorithm>
#include <utility>
#include "./tensor_math.h"
#include "./tensor_math_cpp.h"
#include "./tensor_math_cuda.h"
#include "./tensor_math_opencl.h"
#define Noaxis 9999
namespace singa {
template half_float::half TypeCast(const float &x);
template float TypeCast(const half_float::half &x);
template int TypeCast(const float &x);
template float TypeCast(const int &x);
Tensor::~Tensor() {
if (block_ != nullptr && block_->DecRefCount() == 0) {
device_->FreeBlock(block_);
}
block_ = nullptr;
}
Tensor::Tensor() {
device_ = defaultDevice;
stride_ = {1};
}
// non-strided constructors
Tensor::Tensor(const Shape &shape, DataType dtype)
: data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size) {
block_ = device_->NewBlock((int)size);
}
generate_stride();
}
// non-strided constructors with device
Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size) {
block_ = device_->NewBlock((int)size);
}
generate_stride();
}
Tensor::Tensor(const Tensor &in)
: data_type_(in.data_type_),
device_(in.device_),
block_(in.block()),
shape_(in.shape_),
stride_(in.stride_) {
if (block_ != nullptr) block_->IncRefCount();
}
Tensor::Tensor(Tensor &&in)
: data_type_(in.data_type_),
device_(in.device_),
shape_(std::move(in.shape_)),
stride_(std::move(in.stride_)) {
block_ = in.block_;
in.block_ = nullptr;
}
Tensor &Tensor::ResetLike(const Tensor &in) {
if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
device_ = in.device_;
data_type_ = in.data_type_;
block_ = device_->NewBlock((int)in.MemSize());
}
shape_ = in.shape_;
stride_ = in.stride_;
return *this;
}
Tensor &Tensor::Resize(const Shape &shape) {
if (Size() != Product(shape)) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
}
shape_ = shape;
generate_stride();
return *this;
}
Tensor Resize(const Tensor &in, const Shape &shape) {
Tensor out(in);
out.Resize(shape);
return out;
}
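/// typedef LDType and RDType according to the left and right data type values,
/// and Lang according to the device language; used by AsType for casting
/// between data types. LDType, RDType and Lang are available inside the code
/// block __VA_ARGS__.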
#define TYPE_TYPE_LANG_SWITCH(ldtype, LDType, rdtype, RDType, ltype, Lang, \
...) \
do { \
const int _SwitchShift = 3; \
int _SwitchHash = \
((ldtype) << _SwitchShift * 2) + ((rdtype) << _SwitchShift) + (ltype); \
switch (_SwitchHash) { \
case (((kFloat16) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCpp): { \
typedef half_float::half LDType; \
typedef float RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kFloat16 << _SwitchShift) + \
kCpp): { \
typedef float LDType; \
typedef half_float::half RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat16) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCuda): { \
typedef half_float::half LDType; \
typedef float RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kFloat16 << _SwitchShift) + \
kCuda): { \
typedef float LDType; \
typedef half_float::half RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) + \
kCuda): { \
typedef float LDType; \
typedef int RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCuda): { \
typedef int LDType; \
typedef float RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) + \
kCpp): { \
typedef float LDType; \
typedef int RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCpp): { \
typedef int LDType; \
typedef float RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknown combination of left data type " \
<< DataType_Name(ldtype) << " and right data type " \
<< DataType_Name(rdtype) << " and language " \
<< LangType_Name(ltype); \
} \
} while (0)
// return new tensor
Tensor Tensor::AsType(const DataType type) const {
if (data_type_ != type) {
const Tensor &thisRef = *this;
Tensor ret(shape_, device_, type);
TYPE_TYPE_LANG_SWITCH(
data_type_, LDType, type, RDType, device_->lang(), Lang, {
ret.device()->Exec(
[thisRef, ret](Context *ctx) mutable {
CastCopy<LDType, RDType, Lang>(&thisRef, &ret, ctx);
},
{this->block()}, {ret.block()}, "AsType");
});
return ret;
} else {
Tensor t = this->Clone();
return t;
}
}
Tensor &Tensor::ToType(const DataType type) {
CHECK(block() && block()->initialized() == true)
<< "the data of the tensor needs be initialized before casting to "
"another type";
if (data_type_ != type) {
auto ret = this->AsType(type);
std::swap(ret.block_, block_);
data_type_ = type;
}
return *this;
}
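// Illustrative usage (a minimal sketch; the tensor and values here are
// hypothetical):
//   Tensor a(Shape{2, 3});       // kFloat32 by default
//   a.SetValue(1.0f);            // initialise the data
//   Tensor b = a.AsType(kInt);   // returns a new kInt tensor
//   a.ToType(kFloat16);          // casts 'a' in place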
Tensor &Tensor::ToDevice(std::shared_ptr<Device> dst) {
// TODO(wangwei) the comparison is restricted. May compare against device ID?
if (device_ != dst) {
// WARNING: this function can't be buffered
Tensor tmp(shape_, dst, data_type_);
if (block_ != nullptr && Size() && block_->initialized())
tmp.CopyData(*this);
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = tmp.block_;
tmp.block_ = nullptr;
device_ = dst;
}
return *this;
}
Tensor &Tensor::ToHost() {
if (device_ != defaultDevice) ToDevice(device_->host());
return *this;
}
template <typename DType>
void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
const size_t offset) const {
CHECK_EQ(sizeof(DType), SizeOf(data_type_))
<< "data_type is " << DataType_Name(data_type_)
<< " user given type is of size " << sizeof(DType);
if (src != nullptr) {
Device *dev = device_.get();
const Tensor &thisRef = *this;
size_t nBytes = sizeof(DType) * num;
size_t dst_offset = sizeof(DType) * offset;
device_->Exec(
[dev, thisRef, src, nBytes, dst_offset](Context *ctx) mutable {
dev->CopyDataFromHostPtr(thisRef.block(), src, nBytes, dst_offset,
ctx);
},
{}, {block()}, "CopyDataFromHostPtr");
} else {
LOG(WARNING) << "Copy data from null host ptr";
}
}
template void Tensor::CopyDataFromHostPtr(const unsigned char *src,
const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const half_float::half *src,
const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num,
const size_t offset) const;
void Tensor::CopyData(const Tensor &src) {
CHECK_EQ(Size(), src.Size());
CHECK(block_ != nullptr);
CHECK_EQ(src.data_type(), data_type_)
<< "Could not copy data between different data type";
// Do copy only if the src's block is already initialized.
if (src.block_ != nullptr) {
singa::CopyDataToFrom(this, src, Size(), 0, 0);
}
}
void Tensor::RepeatData(const vector<size_t> &repeats, int axis,
int total_repeats, const Tensor &src) {
if (repeats.size() == 1) {
CHECK_EQ(Size(), src.Size() * total_repeats);
} else {
CHECK_EQ(Size(), src.Size() * total_repeats / src.shape()[axis]);
}
CHECK(block_ != nullptr);
// Do repeat only if the src's block is already initialized.
if (src.block_ != nullptr) {
singa::RepeatDataToFrom(false, repeats, axis, this, src, Size());
}
}
void Tensor::FromProto(const singa::TensorProto &proto) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = nullptr;
for (uint32_t s : proto.shape()) shape_.push_back(s);
data_type_ = proto.data_type();
block_ = device_->NewBlock((int)(Product(shape()) * SizeOf(data_type_)));
// transpose_ = proto.transpose();
stride_.clear();
for (int32_t s : proto.stride()) stride_.push_back(s);
switch (data_type_) {
case kFloat32: {
std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data_ptr[i] = static_cast<float>(proto.float_data((int)i));
CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
break;
}
case kDouble: {
std::unique_ptr<double[]> data(new double[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = proto.double_data((int)i);
CopyDataFromHostPtr<double>(data.get(), Product(shape_));
break;
}
case kInt: {
std::unique_ptr<int[]> data(new int[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = proto.int_data((int)i);
CopyDataFromHostPtr<int>(data.get(), Product(shape_));
break;
}
    /// TODO(wangji): support the C++ char type using the protobuf bytes type,
    /// which is equivalent to the string type and thus differs from the other
    /// cases. The kChar and kUChar cases below are yet to be implemented.
/*
case kChar: {
std::unique_ptr<char[]> data(new char[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = static_cast<char>(proto.bytes_data(i));
break;
}
case kUChar: {
std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = static_cast<unsigned char>(proto.bytes_data(i));
break;
}
*/
default: {
LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_);
}
}
}
void Tensor::to_proto(singa::TensorProto *proto) const {
proto->clear_shape();
for (auto s : shape_) {
proto->add_shape(s);
}
proto->set_data_type(data_type_);
// proto->set_transpose(transpose_);
proto->clear_stride();
for (auto s : stride_) {
proto->add_stride(s);
}
switch (data_type_) {
case kFloat32: {
proto->clear_float_data();
const float *data_ptr = data<float>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_float_data(data_ptr[i]);
break;
}
case kDouble: {
proto->clear_double_data();
const double *data_ptr = data<double>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_double_data(data_ptr[i]);
break;
}
case kInt: {
proto->clear_int_data();
const int *data_ptr = data<int>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_int_data(data_ptr[i]);
break;
}
/*
case kChar: {
proto->clear_bytes_data();
const char *data = data<char>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_bytes_data(static_cast<unsigned char>(data[i]));
break;
}
case kUChar: {
proto->clear_bytes_data();
const unsigned char *data = data<unsigned char>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_bytes_data(static_cast<unsigned char>(data[i]));
break;
}
*/
default: {
LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_);
}
}
}
void Tensor::ToProto(singa::TensorProto *proto) const { to_proto(proto); }
Tensor Tensor::Repeat(const vector<size_t> &repeats, int axis,
std::shared_ptr<Device> device) {
if (device == nullptr) device = device_;
vector<size_t> tshape;
int total_repeats = 0;
if (axis == Noaxis) {
total_repeats = repeats[0];
tshape.push_back(Product(shape_) * total_repeats);
} else {
if (repeats.size() == 1) {
total_repeats = repeats[0];
for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
if (i == axis) {
tshape.push_back(shape_[i] * total_repeats);
} else {
tshape.push_back(shape_[i]);
}
}
} else {
if (repeats.size() != shape_[axis]) {
LOG(FATAL) << "the repeats number doesn't match the axis";
}
for (size_t i = 0; i < shape_[axis]; i++) {
if (repeats[i] < 0) {
LOG(FATAL) << "the repeats number is less than zero";
}
total_repeats += repeats[i];
}
for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
if (i == axis) {
tshape.push_back(total_repeats);
} else {
tshape.push_back(shape_[i]);
}
}
}
}
  Tensor t(tshape, device);
// t.stride_.push_back(1);
t.RepeatData(repeats, axis, total_repeats, *this);
return t;
}
Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
if (device == nullptr) device = device_;
Tensor t(shape_, device, data_type_);
// t.transpose_ = transpose_;
t.stride_ = stride_;
t.CopyData(*this);
return t;
}
void Tensor::Clone(Tensor *&other, std::shared_ptr<Device> device) const {
if (device == nullptr) device = device_;
other = new Tensor(shape_, device, data_type_);
other->stride_ = stride_;
other->CopyData(*this);
return;
}
Tensor &Tensor::Broadcast(const Shape &shape, const int ignore_last_dim) {
// TODO(wangwei) do we need to transform the mem layout if the tensor was
// transposed?
auto m = shape_.size() - 1, n = shape.size() - 1;
// ignore_last_dim is useful for mult broadcast
// e.g. (2,3,4)x(4,5) to (2,3,4)x(2,4,5)
if (ignore_last_dim < std::min(m, n) + 1) {
for (size_t i = ignore_last_dim; i <= std::min(m, n); i++) {
if ((shape.at(n - i) != shape_.at(m - i)) && (shape.at(n - i) != 1)) {
CHECK_EQ(shape_.at(m - i), 1) << "i= " << i << "\n"; // << Backtrace();
shape_.at(m - i) = shape.at(n - i);
stride_.at(m - i) = 0;
}
}
}
if (m < n) {
for (size_t i = m + 1; i <= n; i++) {
shape_.emplace(shape_.begin(), shape.at(n - i));
stride_.emplace(stride_.begin(), 0);
}
}
return *this;
}
Tensor Broadcast(const Tensor &in, const Shape &shape,
const int ignore_last_dim) {
Tensor out(in);
return out.Broadcast(shape, ignore_last_dim);
}
Tensor &Tensor::T() {
// this function only works for 2d tensors
CHECK_EQ(shape_.size(), 2u);
Transpose();
return *this;
}
// normal transpose without axes
Tensor &Tensor::Transpose() {
std::reverse(shape_.begin(), shape_.end());
std::reverse(stride_.begin(), stride_.end());
return *this;
}
// transpose with axes
Tensor &Tensor::Transpose(const vector<size_t> &axes) {
CHECK_EQ(axes.size(), shape_.size())
<< "Tranpose axes's length should be equal to shape";
auto shape = shape_;
auto stride = stride_;
shape_.clear();
stride_.clear();
for (size_t n = 0; n < axes.size(); ++n) {
shape_.push_back(shape[axes[n]]);
stride_.push_back(stride[axes[n]]);
}
return *this;
}
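// Illustrative example (a sketch): for a tensor of shape {2, 3, 4},
// Transpose({2, 0, 1}) permutes shape_ to {4, 2, 3} and reorders stride_ in
// the same way, without moving any data.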
// normal transpose without axes
Tensor Transpose(const Tensor &in) {
Tensor out(in);
out.Transpose();
return out;
}
// transpose with axes
Tensor Transpose(const Tensor &in, const vector<size_t> &axes) {
Tensor out(in);
out.Transpose(axes);
return out;
}
Tensor &Tensor::operator=(const Tensor &in) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
stride_ = in.stride_;
data_type_ = in.data_type_;
shape_ = in.shape_;
device_ = in.device_;
block_ = in.block();
if (block_ != nullptr) block_->IncRefCount();
return *this;
}
Tensor &Tensor::operator=(Tensor &&in) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
stride_ = std::move(in.stride_);
data_type_ = in.data_type_;
shape_ = std::move(in.shape_);
device_ = in.device_;
block_ = in.block_;
in.block_ = nullptr;
return *this;
}
#define GenUnaryTensorArgMemberFn(op, fn) \
Tensor &Tensor::op(const Tensor &in) { \
Tensor out(*this); \
fn(*this, in, &out); \
return *this; \
}
GenUnaryTensorArgMemberFn(operator+=, Add);
GenUnaryTensorArgMemberFn(operator-=, Sub);
GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
GenUnaryTensorArgMemberFn(operator/=, Div);
#define GenUnaryScalarArgMemberFn(op, fn) \
template <typename DType> \
Tensor &Tensor::op(const DType x) { \
Tensor out(*this); \
fn(*this, x, &out); \
return *this; \
} \
template Tensor &Tensor::op<float>(const float x)
GenUnaryScalarArgMemberFn(operator-=, Sub);
GenUnaryScalarArgMemberFn(operator+=, Add);
GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
GenUnaryScalarArgMemberFn(operator/=, Div);
// ====================Tensor Operations=======================================
void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
const size_t dst_offset, const size_t src_offset) {
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
size_t nBytes = num * width;
auto d_offset = dst_offset * width;
auto s_offset = src_offset * width;
CHECK_GE(src.MemSize(), s_offset + nBytes);
CHECK_GE(dst->MemSize(), d_offset + nBytes);
Device *dev = nullptr;
CopyDirection direct;
std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
if (dst_dev->lang() != src_dev->lang()) {
    // let the non-cpp device conduct the copy op
if (dst_dev->lang() == kCpp) {
dev = src_dev.get();
direct = kDeviceToHost;
} else if (src_dev->lang() == kCpp) {
dev = dst_dev.get();
direct = kHostToDevice;
} else {
LOG(FATAL) << "Not support mem copy between Cuda and OpenCL device";
}
} else {
dev = src_dev.get();
direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
}
Tensor &dstRef = *dst;
dev->Exec(
[dev, dstRef, src, nBytes, direct, d_offset,
s_offset](Context *ctx) mutable {
Block *from = src.block(), *to = dstRef.block();
dev->CopyDataToFrom(to, from, nBytes, direct, (int)d_offset,
(int)s_offset, ctx);
},
{src.block()}, {dst->block()}, "CopyDataToFrom");
}
void RepeatDataToFrom(bool broadcast_flag, const vector<size_t> &repeats,
int axis, Tensor *dst, const Tensor &src,
const size_t num) {
if (repeats.size() == 1) {
broadcast_flag = true;
} else if (repeats.size() > 1) {
if (axis == Noaxis) {
LOG(FATAL) << "When repeats parameter is sequence, axis cannot be None";
}
}
for (size_t i = 0; i < repeats.size(); i++) {
CHECK_GE(repeats[i], 0);
}
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
// size_t nBytes = num * width;
int chunk = width;
int axis_shape = 1;
int shape_outer = 1;
if (axis == Noaxis) {
axis_shape = 1;
shape_outer = Product(src.shape());
} else {
for (int i = 0; i < axis; i++) {
shape_outer *= src.shape()[i];
}
axis_shape = src.shape()[axis];
for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) {
chunk *= src.shape()[i];
}
}
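  // e.g. (illustrative): for src of shape {2, 3, 4} and axis = 1,
  // shape_outer = 2, axis_shape = 3 and chunk = 4 * width, so each copied
  // chunk is one contiguous slice over the dimensions after 'axis'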
Device *dev = nullptr;
CopyDirection direct;
std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
if (dst_dev->lang() != src_dev->lang()) {
    // let the non-cpp device conduct the copy op
if (dst_dev->lang() == kCpp) {
dev = src_dev.get();
direct = kDeviceToHost;
} else if (src_dev->lang() == kCpp) {
dev = dst_dev.get();
direct = kHostToDevice;
} else {
      LOG(FATAL)
          << "Repeat copy between CUDA and OpenCL devices is not supported";
}
} else {
dev = src_dev.get();
direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
}
int dst_offset = 0;
int src_offset = 0;
Tensor &dstRef = *dst;
for (int i = 0; i < shape_outer; i++) {
for (int j = 0; j < axis_shape; j++) {
int temp = broadcast_flag ? repeats[0] : repeats[j];
for (int k = 0; k < temp; k++) {
dev->Exec(
[dev, dstRef, src, chunk, direct, dst_offset,
src_offset](Context *ctx) mutable {
Block *from = src.block(), *to = dstRef.block();
dev->CopyDataToFrom(to, from, chunk, direct, dst_offset,
src_offset, ctx);
},
{src.block()}, {dst->block()}, "CopyDataToFrom");
dst_offset += chunk;
}
src_offset += chunk;
}
}
}
//============================================================================
/// typedef DType according to the type value.
/// DType would be used in the code block __VA_ARGS__.
#define TYPE_SWITCH(type, DType, ...) \
do { \
switch (type) { \
case kFloat16: { \
typedef half_float::half DType; \
{ __VA_ARGS__ } \
break; \
} \
case kFloat32: { \
typedef float DType; \
{ __VA_ARGS__ } \
break; \
} \
case kInt: { \
typedef int DType; \
{ __VA_ARGS__ } \
break; \
} \
case kChar: { \
typedef char DType; \
{ __VA_ARGS__ } \
break; \
} \
case kDouble: { \
typedef double DType; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
} \
} while (0)
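// Illustrative usage of TYPE_SWITCH (a minimal sketch; the body is made up
// for demonstration):
//   TYPE_SWITCH(kFloat32, DType, {
//     DType zero = DType(0);  // DType resolves to float in this branch
//   });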
/// typedef DType and Lang according to data type and device programming
/// language respectively.
/// dtype is from DataType, and ltype is from LangType.
/// DType and Lang would be used in __VA_ARGS__.
#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \
do { \
const int _SwitchShift = 3; \
int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \
switch (_SwitchHash) { \
case ((kFloat16 << _SwitchShift) + kCpp): { \
typedef half_float::half DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat16 << _SwitchShift) + kCuda): { \
typedef half_float::half DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kCuda): { \
typedef float DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kInt << _SwitchShift) + kCuda): { \
typedef int DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kCpp): { \
typedef float DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kInt << _SwitchShift) + kCpp): { \
typedef int DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kOpencl): { \
typedef float DType; \
typedef lang::Opencl Lang; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknown combination of data type " \
<< DataType_Name(dtype) << " and language " \
<< LangType_Name(ltype); \
} \
} while (0)
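// Illustrative usage of TYPE_LANG_SWITCH (a sketch; Set is one of the
// tensor_math kernels dispatched this way elsewhere in this file):
//   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
//     // DType and Lang are usable here, e.g. Set<DType, Lang>(...)
//   });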
// =============Element-wise operations====================================
float Tensor::l1() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec(
[&nrm, this](Context *ctx) {
DType ret = DType(0);
Asum<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
},
{this->block()}, {}, "l1");
});
return nrm / Size();
}
// DEPRECATED use l1()
float Tensor::L1() const { return l1(); }
/// L2 norm; do not use the name Nrm2 (name conflict).
float Tensor::l2() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec(
[&nrm, this](Context *ctx) {
Nrm2<DType, Lang>(*this, &nrm, ctx);
},
{this->block()}, {}, "L1");
});
return nrm / Size();
}
// DEPRECATED use l2()
float Tensor::L2() const { return l2(); }
template <typename SType>
void Tensor::SetValue(const SType x) {
// auto size = Size();
auto ptr = block_;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
DType tmp = TypeCast<SType, DType>(x);
Tensor &thisRef = *this;
device_->Exec(
[thisRef, tmp](Context *ctx) mutable {
Set<DType, Lang>(tmp, &thisRef, ctx);
},
{}, {ptr}, "SetValue");
});
}
template void Tensor::SetValue<float>(const float x);
template void Tensor::SetValue<half_float::half>(const half_float::half x);
template void Tensor::SetValue<int>(const int x);
template <typename SType>
void Tensor::get_value(SType *value, const size_t num) const {
CHECK(device_ == defaultDevice);
Tensor t(shape_, device_, data_type_);
  // the Transform function arranges the data contiguously in memory,
  // taking the stride into account
singa::Transform(*this, &t);
auto ptr = static_cast<const SType *>(t.block()->data());
for (size_t i = 0; i < num; i++) value[i] = ptr[i];
}
template void Tensor::get_value<float>(float *value, const size_t num) const;
template void Tensor::get_value<half_float::half>(half_float::half *value,
const size_t num) const;
template void Tensor::get_value<int>(int *value, const size_t num) const;
// DEPRECATED
template <typename SType>
void Tensor::GetValue(SType *value, const size_t num) const {
get_value(value, num);
}
template void Tensor::GetValue<float>(float *value, const size_t num) const;
template void Tensor::GetValue<int>(int *value, const size_t num) const;
#define EltwiseUnaryTensorFn(fn, t, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[t, retRef](Context *ctx) mutable { \
fn<DType, Lang>(t, &retRef, ctx); \
}, \
{t.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenUnaryTensorFn(fn) \
Tensor fn(const Tensor &in) { \
Tensor ret(in.shape(), in.device(), in.data_type()); \
Tensor *retptr = &ret; \
EltwiseUnaryTensorFn(fn, in, retptr); \
return ret; \
} \
void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
GenUnaryTensorFn(Abs);
GenUnaryTensorFn(Erf);
GenUnaryTensorFn(Ceil);
GenUnaryTensorFn(Floor);
GenUnaryTensorFn(Round);
GenUnaryTensorFn(RoundE);
GenUnaryTensorFn(Exp);
GenUnaryTensorFn(Log);
GenUnaryTensorFn(ReLU);
GenUnaryTensorFn(Sigmoid);
GenUnaryTensorFn(SoftPlus);
GenUnaryTensorFn(SoftSign);
GenUnaryTensorFn(Sign);
GenUnaryTensorFn(Sqrt);
GenUnaryTensorFn(Square);
GenUnaryTensorFn(Transform);
GenUnaryTensorFn(Cos);
GenUnaryTensorFn(Cosh);
GenUnaryTensorFn(Acos);
GenUnaryTensorFn(Acosh);
GenUnaryTensorFn(Sin);
GenUnaryTensorFn(Sinh);
GenUnaryTensorFn(Asin);
GenUnaryTensorFn(Asinh);
GenUnaryTensorFn(Tan);
GenUnaryTensorFn(Tanh);
GenUnaryTensorFn(Atan);
GenUnaryTensorFn(Atanh);
GenUnaryTensorFn(SoftMax);
// add axis to softmax API according to ONNX specification
// https://github.com/onnx/onnx/blob/master/docs/Operators.md#Softmax
void SoftMax(const Tensor &in, Tensor *out, int axis) {
// {a_0, a_1, ..., a_k-1, a_k, ... a_n-1}
// reshape to
// { a_0 * a_1 * ... a_k-1, a_k * ... a_n-1 }
  // assert axis in [-r, r-1]
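  // e.g. (illustrative): for an input of shape {2, 3, 4} and axis = 1, the
  // coerced_shape computed below is {2, 12}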
CHECK_LE(axis, (int)in.shape().size() - 1);
CHECK_GE(axis, -1 * (int)in.nDim());
Shape original_shape = in.shape();
if (axis < 0) axis = in.shape().size() + axis;
Shape coerced_shape = {1, 1};
for (std::size_t i = 0, max = in.shape().size(); i != max; ++i) {
if (i < axis)
coerced_shape[0] *= in.shape()[i];
else
coerced_shape[1] *= in.shape()[i];
}
Tensor in_reshaped = Reshape(in, coerced_shape);
out->Reshape(coerced_shape);
  // optimise by subtracting the row-wise max: x - x.max()
auto in_max = RowMax(in_reshaped);
in_max.Reshape({coerced_shape[0], 1});
in_reshaped = in_reshaped - in_max;
SoftMax(in_reshaped, out);
out->Reshape(original_shape);
}
Tensor SoftMax(const Tensor &in, int axis) {
Tensor ret(in.shape(), in.device(), in.data_type());
auto *retptr = &ret;
SoftMax(in, retptr, axis);
return ret;
}
void SoftMaxBackward(const Tensor &in, Tensor *out, int axis,
const Tensor &fdout) {
// {a_0, a_1, ..., a_k-1, a_k, ... a_n-1}
// reshape to
// { a_0 * a_1 * ... a_k-1, a_k * ... a_n-1 }
  // assert axis in [-r, r-1]
CHECK_LE(axis, (int)in.shape().size() - 1);
CHECK_GE(axis, -1 * (int)in.nDim());
Shape original_shape = in.shape();
if (axis < 0) axis = in.shape().size() + axis;
Shape coerced_shape = {1, 1};
for (std::size_t i = 0, max = in.shape().size(); i != max; ++i) {
if (i < axis)
coerced_shape[0] *= in.shape()[i];
else
coerced_shape[1] *= in.shape()[i];
}
Tensor in_reshaped = Reshape(in, coerced_shape);
out->Reshape(coerced_shape);
do {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
Tensor &outRef = *out;
out->device()->Exec(
[in, outRef, fdout](Context *ctx) mutable {
SoftMaxBackward<DType, Lang>(in, &outRef, fdout, ctx);
},
{in.block(), fdout.block()}, {out->block()}, "SoftmaxBackward");
});
} while (0);
out->Reshape(original_shape);
}
Tensor SoftMaxBackward(const Tensor &in, int axis, const Tensor &fdout) {
Tensor ret(in.shape(), in.device(), in.data_type());
auto *retptr = &ret;
SoftMaxBackward(in, retptr, axis, fdout);
return ret;
}
#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \
do { \
TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())) \
<< "lhs dtype size" << sizeof(DType) << " rhs dtype size" \
<< SizeOf(rhs.data_type()); \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[lhs, rhs, retRef](Context *ctx) mutable { \
fn<DType, Lang>(lhs, rhs, &retRef, ctx); \
}, \
{lhs.block(), rhs.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenBinaryTensorFn(op, fn) \
Tensor op(const Tensor &lhs, const Tensor &rhs) { \
if (lhs.shape() != rhs.shape()) { \
if (lhs.data_type() == kFloat32 && rhs.data_type() == kFloat32) { \
auto lhs_ = Broadcast(lhs, rhs.shape()); \
auto rhs_ = Broadcast(rhs, lhs.shape()); \
Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type()); \
fn(lhs_, rhs_, &ret); \
return ret; \
} else { \
/* lhs tensor and rhs tensor are not both in float, cast to float */\
Tensor tmp_lhs = lhs.Clone().AsType(kFloat32); \
Tensor tmp_rhs = rhs.Clone().AsType(kFloat32); \
tmp_lhs = Broadcast(tmp_lhs, tmp_rhs.shape()); \
tmp_rhs = Broadcast(tmp_rhs, tmp_lhs.shape()); \
Tensor ret(tmp_lhs.shape(), tmp_lhs.device(), tmp_lhs.data_type()); \
fn(tmp_lhs, tmp_rhs, &ret); \
/* if lhs and rhs are both int, cast back to int */ \
if (lhs.data_type() == kInt && rhs.data_type() == kInt) \
return ret.Clone().AsType(kInt); \
return ret; \
} \
} else { \
if (lhs.data_type() == kFloat32 && rhs.data_type() == kFloat32) { \
Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \
fn(lhs, rhs, &ret); \
return ret; \
} else { \
/* lhs tensor and rhs tensor are not both in float, cast to float */\
Tensor tmp_lhs = lhs.Clone().AsType(kFloat32); \
Tensor tmp_rhs = rhs.Clone().AsType(kFloat32); \
Tensor ret(tmp_lhs.shape(), tmp_lhs.device(), tmp_lhs.data_type()); \
fn(tmp_lhs, tmp_rhs, &ret); \
/* if lhs and rhs are both int, cast back to int */ \
if (lhs.data_type() == kInt && rhs.data_type() == kInt) \
return ret.Clone().AsType(kInt); \
return ret; \
} \
} \
} \
void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
CHECK_EQ(lhs.device(), ret->device()); \
CHECK_EQ(rhs.device(), ret->device()); \
if (lhs.shape() != rhs.shape()) { \
auto lhs_ = Broadcast(lhs, rhs.shape()); \
auto rhs_ = Broadcast(rhs, lhs.shape()); \
CHECK(lhs_.shape() == ret->shape()); \
EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret); \
} else { \
CHECK(lhs.shape() == ret->shape()); \
EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \
} \
}
// broadcasting operations:
// https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md
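// e.g. (illustrative): adding tensors of shape {2, 3, 4} and {3, 4}
// broadcasts the second operand to {2, 3, 4} before the element-wise op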
GenBinaryTensorFn(operator+, Add);
GenBinaryTensorFn(operator-, Sub);
GenBinaryTensorFn(operator*, EltwiseMult);
GenBinaryTensorFn(operator/, Div);
GenBinaryTensorFn(Pow, Pow);
GenBinaryTensorFn(operator<, LT);
GenBinaryTensorFn(operator<=, LE);
GenBinaryTensorFn(operator>, GT);
GenBinaryTensorFn(operator>=, GE);
GenBinaryTensorFn(operator==, EQ);
GenBinaryTensorFn(ReLUBackward, ReLUBackward);
#define EltwiseTensorScalarFn(fn, t, x, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
DType tmp_x = TypeCast<SType, DType>(x); \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[t, tmp_x, retRef](Context *ctx) mutable { \
fn<DType, Lang>(t, tmp_x, &retRef, ctx); \
}, \
{t.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenTensorScalarFn(op, fn) \
template <typename SType> \
Tensor op(const Tensor &in, const SType x) { \
if (in.data_type() == kFloat32 && std::is_same<SType, float>::value){ \
Tensor ret(in.shape(), in.device(), in.data_type()); \
fn(in, x, &ret); \
return ret; \
} else if (in.data_type() == kFloat32) { \
Tensor ret(in.shape(), in.device(), in.data_type()); \
float tmp_x = x; \
fn(in, tmp_x, &ret); \
return ret; \
} else { \
/* tensor and scalar are not both in float, cast to float */ \
Tensor tmp_in = in.Clone().AsType(kFloat32); \
float tmp_x = x; \
Tensor ret(tmp_in.shape(), tmp_in.device(), tmp_in.data_type()); \
fn(tmp_in, tmp_x, &ret); \
/* if tensor and scalar are both int, cast back to int */ \
if (in.data_type() == kInt && std::is_same<SType, int>::value) \
return ret.Clone().AsType(kInt); \
return ret; \
} \
} \
template <typename SType> \
void fn(const Tensor &in, const SType x, Tensor *ret) { \
EltwiseTensorScalarFn(fn, in, x, ret); \
} \
template Tensor op<float>(const Tensor &in, const float x); \
template void fn<float>(const Tensor &in, const float x, Tensor *ret)
GenTensorScalarFn(operator+, Add);
GenTensorScalarFn(operator-, Sub);
GenTensorScalarFn(operator*, EltwiseMult);
GenTensorScalarFn(operator/, Div);
GenTensorScalarFn(Pow, Pow);
GenTensorScalarFn(operator<, LT);
GenTensorScalarFn(operator<=, LE);
GenTensorScalarFn(operator>, GT);
GenTensorScalarFn(operator>=, GE);
GenTensorScalarFn(operator==, EQ);
template <typename SType>
Tensor Div(const SType alpha, const Tensor &in) {
Tensor out(in.shape(), in.device(), in.data_type());
Div(alpha, in, &out);
return out;
}
template Tensor Div<float>(const float, const Tensor &);
template <typename SType>
void Div(const SType alpha, const Tensor &in, Tensor *out) {
CheckDataTypeAndLang(in, *out);
CHECK(in.shape() == out->shape());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
DType tmp_alpha = TypeCast<SType, DType>(alpha);
Tensor &outRef = *out;
in.device()->Exec(
[tmp_alpha, in, outRef](Context *ctx) mutable {
Div<DType, Lang>(tmp_alpha, in, &outRef, ctx);
},
{in.block()}, {out->block()}, "Div");
});
}
template void Div<float>(const float, const Tensor &, Tensor *);
// =============Matrix operations============================================
Tensor Average(const Tensor &M, int axis) {
  // operator/ only has an implementation for the float scalar type, hence it
  // is necessary to cast the denominator to a float.
  // TODO(wangwei) implement a function to cast the scalar types involved in
  // Tensor functions. E.g.,
  // template <typename S, typename D>
  // D CastTo(S x) {
  //   return D(x);
  // }
  // for special types, e.g., fp16:
  // template <>
  // fp16 CastTo(float x) {
  //   ....
  // }
if (axis == 0) {
return Sum(M, 0) / (1.0f * M.shape(0));
} else if (axis == 1) {
return Sum(M, 1) / (1.0f * M.shape(1));
} else {
LOG(FATAL) << "Not currently support Sum over axis = " << axis;
}
}
// TODO(wangwei) consider async exec
template <>
float Sum<float>(const Tensor &in) {
float s = 0.0f;
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec(
// cannot use this sum function in computational graph
[in, one, &s](Context *ctx) mutable {
DType ret = DType(0);
Dot<DType, Lang>(in, one, &ret, ctx);
s = ret;
},
{in.block(), one.block()}, {}, "Sum");
});
return s;
}
Tensor Sum(const Tensor &M, int axis) {
if (axis == 0) {
Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
SumRows(M, &out);
return out;
} else {
CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
Tensor out = Tensor(Shape{M.shape(0)}, M.device(), M.data_type());
SumColumns(M, &out);
return out;
}
}
Tensor SumAll(const Tensor &in) {
Tensor out({(size_t)1}, in.device(), in.data_type());
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec(
[in, one, out](Context *ctx) mutable {
Dot<DType, Lang>(in, one, &out, ctx);
},
{in.block(), one.block()}, {out.block()}, "SumAll");
});
return out;
}
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
in.device()->Exec(
[in, ret](Context *ctx) mutable {
// size_t nrow = 1;
// if (in.nDim() > 1) nrow = in.shape(0);
// size_t ncol = in.Size() / nrow;
RowMax<DType, Lang>(in, &ret, ctx);
},
{in.block()}, {ret.block()}, "RowMax");
});
return ret;
}
void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
/// Add column 'v' onto each column of matrix M;
template <typename SType>
void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor *M) {
if (M->transpose()) {
Tensor X(Transpose(*M));
AddRow(v, &X);
} else {
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M->shape(0), nb_col = M->shape(1);
CHECK_EQ(nb_row, v.Size());
Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor vmat(Reshape(v, Shape{nb_row, 1}));
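    // the GEMM below computes M = alpha * vmat * one + beta * M, i.e. it adds
    // alpha * v to every column of M (with the existing M scaled by beta)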
Mult(alpha, vmat, one, beta, M);
}
}
template void AddColumn(const float alpha, const float beta, const Tensor &v,
Tensor *M);
void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
/// Add row 'v' to each row of matrix M; the result is written back into M.
template <typename SType>
void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
if (M->transpose()) {
Tensor X(Transpose(*M));
AddColumn(v, &X);
} else {
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M->shape(0), nb_col = M->shape(1);
CHECK_EQ(nb_col, v.Size());
Tensor one(Shape{nb_row, 1}, M->device(), M->data_type());
one.SetValue(1.0f);
Tensor vmat(Reshape(v, Shape{1, nb_col}));
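    // the GEMM below computes M = alpha * one * vmat + beta * M, i.e. it adds
    // alpha * v to every row of M (with the existing M scaled by beta)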
Mult(alpha, one, vmat, beta, M);
}
}
template void AddRow(const float alpha, const float beta, const Tensor &v,
Tensor *M);
/// Divide each column of matrix M by column 'v'; M is updated in place.
void DivColumn(const Tensor &v, Tensor *M) {
Tensor inv;
TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
MultColumn(inv, M);
}
Tensor ConcatOn(const std::vector<Tensor> &in, int axis) {
vector<Tensor> tmp;
Shape out_shape = in[0].shape();
size_t dim = in[0].shape().size();
// CHECK_GE(dim, 2u) << " Only work for tensor of dim >=2 ";
size_t size = in[0].Size() / in[0].shape(axis);
size_t new_size = 0u;
for (const auto &t : in) {
CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim";
CHECK_EQ(size, t.Size() / t.shape(axis))
<< "The size of all axis should "
<< " be the same except the concatenated axis";
new_size += t.shape(axis);
}
out_shape[axis] = new_size;
if (axis == 0) {
size_t nrow = 0;
for (const auto &t : in) {
nrow += t.shape(0);
tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
}
auto ret = ConcatenateRows(tmp);
ret.Reshape(out_shape);
return ret;
} else {
for (const auto &t : in) {
size_t nrow = 1;
for (int i = 0; i < axis; i++) nrow *= t.shape(i);
tmp.push_back(Reshape(t, {nrow, t.Size() / nrow}));
}
auto ret = ConcatenateColumns(tmp);
ret.Reshape(out_shape);
return ret;
}
}
Tensor ConcatenateRows(const vector<Tensor> &in) {
size_t nrow = 0, ncol = 0;
CHECK(in.size());
for (const auto &x : in) {
CHECK(!x.transpose());
CHECK_EQ(x.nDim(), 2u);
nrow += x.shape(0);
if (ncol == 0)
ncol = x.shape(1);
else
CHECK_EQ(ncol, x.shape(1));
}
Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type());
size_t dst_offset = 0;
for (const auto &x : in) {
CopyDataToFrom(&out, x, x.Size(), dst_offset, 0);
dst_offset += x.Size();
}
return out;
}
Tensor ConcatRows(const vector<Tensor> &in) { return ConcatenateRows(in); }
// TODO(wangwei) add a copypatch function to improve the efficiency on GPU.
Tensor ConcatenateColumns(const vector<Tensor> &in) {
size_t nrow = 0, ncol = 0;
CHECK(in.size());
for (const auto &x : in) {
CHECK(!x.transpose());
CHECK_EQ(x.nDim(), 2u);
ncol += x.shape(1);
if (nrow == 0)
nrow = x.shape(0);
else
CHECK_EQ(nrow, x.shape(0));
}
Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type());
for (size_t row = 0; row < nrow; row++) {
size_t dst_offset = row * ncol;
for (const auto &x : in) {
size_t src_offset = row * x.shape(1);
CopyDataToFrom(&out, x, x.shape(1), dst_offset, src_offset);
dst_offset += x.shape(1);
}
CHECK_EQ(dst_offset, row * ncol + ncol);
}
return out;
}
Tensor ConcatColumns(const vector<Tensor> &in) {
return ConcatenateColumns(in);
}
Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
CHECK_LT(start, end);
  CHECK_GE(in.shape(0), end) << "Tensor shape(0) must be >= end";
Shape s = in.shape();
s[0] = end - start;
size_t sample_size = in.Size() / in.shape(0);
Tensor out(s, in.device(), in.data_type());
CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size);
return out;
}
Tensor SliceOn(const Tensor &in, const size_t start, const size_t end,
int axis) {
Shape out_shape = in.shape();
out_shape[axis] = end - start;
if (axis == 0) {
auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}),
start, end);
ret.Reshape(out_shape);
return ret;
} else {
size_t nrow = 1;
for (int i = 0; i < axis; i++) nrow *= in.shape(i);
auto suffix = in.Size() / nrow / in.shape(axis);
auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
start * suffix, end * suffix);
ret.Reshape(out_shape);
return ret;
}
}
Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) {
return CopyRows(in, start, end);
}
Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) {
CHECK_EQ(in.nDim(), 2u);
CHECK_LT(start, end);
CHECK_GE(in.shape(1), end);
Shape s{in.shape(0), end - start};
Tensor out(s, in.device(), in.data_type());
for (size_t row = 0; row < out.shape(0); row++) {
size_t src_offset = row * in.shape(1) + start;
size_t dst_offset = row * out.shape(1);
CopyDataToFrom(&out, in, end - start, dst_offset, src_offset);
}
return out;
}
Tensor SliceColumns(const Tensor &in, const size_t start, const size_t end) {
return CopyColumns(in, start, end);
}
/// Divide each row of matrix M by row 'v'; M is updated in place.
void DivRow(const Tensor &v, Tensor *M) {
Tensor inv;
TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
MultRow(inv, M);
}
/// Multiply each column of matrix M by column 'v'; M is updated in place.
void MultColumn(const Tensor &v, Tensor *M) {
// CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
Tensor &MRef = *M;
v.device()->Exec(
[MRef, v](Context *ctx) mutable {
DGMM<DType, Lang>(false, MRef, v, &MRef, ctx);
},
{M->block(), v.block()}, {M->block()}, "MultColumn");
});
}
/// Multiply each row of matrix M by row 'v'; M is updated in place.
void MultRow(const Tensor &v, Tensor *M) {
// CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
Tensor &MRef = *M;
v.device()->Exec(
[MRef, v](Context *ctx) mutable {
DGMM<DType, Lang>(true, MRef, v, &MRef, ctx);
},
{M->block(), v.block()}, {M->block()}, "MultRow");
});
}
void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }
void SumColumns(const Tensor &M, Tensor *v) {
if (M.transpose()) {
Tensor X = Transpose(M);
SumRows(X, v);
} else {
CHECK_EQ(M.nDim(), 2u);
// CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
CHECK_EQ(nb_row, v->Size());
Tensor one(Shape{nb_col}, M.device(), M.data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Mult(M, one, v);
}
}
void SumRows(const Tensor &M, Tensor *v) {
if (M.transpose()) {
Tensor X = Transpose(M);
SumColumns(X, v);
} else {
CHECK_EQ(M.nDim(), 2u);
// CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M.shape(0), nb_col = M.shape(1);
CHECK_EQ(nb_col, v->Size());
Tensor one(Shape{nb_row}, M.device(), M.data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor X = Transpose(M);
Mult(X, one, v);
}
}
// ====================Random operations=====================================
template <typename SType>
void Bernoulli(const SType p, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto prob = TypeCast<SType, DType>(p);
Tensor &outRef = *out;
out->device()->Exec(
[prob, outRef](Context *ctx) mutable {
Bernoulli<DType, Lang>(prob, &outRef, ctx);
},
{}, {out->block()}, "Bernoulli", true);
});
}
template void Bernoulli<float>(const float p, Tensor *out);
template <typename SType>
void Uniform(const SType low, const SType high, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto l = TypeCast<SType, DType>(low);
auto h = TypeCast<SType, DType>(high);
Tensor &outRef = *out;
out->device()->Exec(
[l, h, outRef](Context *ctx) mutable {
Uniform<DType, Lang>(l, h, &outRef, ctx);
},
{}, {out->block()}, "Uniform", true);
});
}
template void Uniform<float>(const float low, const float high, Tensor *out);
template <typename SType>
void Gaussian(const SType mean, const SType std, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto m = TypeCast<SType, DType>(mean);
auto s = TypeCast<SType, DType>(std);
Tensor &outRef = *out;
out->device()->Exec(
[m, s, outRef](Context *ctx) mutable {
Gaussian<DType, Lang>(m, s, &outRef, ctx);
},
{}, {out->block()}, "Gaussian", true);
});
}
template void Gaussian<float>(const float mean, const float std, Tensor *out);
// ================Blas operations============================================
template <typename SType>
void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
Tensor &outRef = *out;
Tensor fake(*out);
out->device()->Exec(
[a, in, outRef, fake](Context *ctx) mutable {
Axpy<DType, Lang>(a, in, &outRef, ctx);
},
{in.block(), out->block()}, {out->block()}, "Axpy");
});
}
template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out);
void Axpy(const Tensor &alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
Tensor fake(*out);
Tensor &outRef = *out;
out->device()->Exec(
[alpha, in, outRef, fake](Context *ctx) mutable {
Axpy<DType, Lang>(alpha, in, &outRef, ctx);
},
{alpha.block(), in.block(), out->block()}, {out->block()}, "Axpy");
});
}
Tensor Mult(const Tensor &A, const Tensor &B) {
auto A_ = Broadcast(A, B.shape(), 2);
auto B_ = Broadcast(B, A.shape(), 2);
Shape s = A_.shape();
s.pop_back();
s.push_back(B.shape(B.nDim() - 1));
Tensor out(s, A.device(), A.data_type());
Mult(A_, B_, &out);
return out;
}
void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
Mult(1.0f, A, B, 0.0f, out);
}
template <typename SType>
void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
Tensor *C) {
Tensor fakeC;
vector<Block *> read_blocks = {A.block(), B.block()};
if (beta) {
fakeC = *C;
read_blocks.push_back(C->block());
}
if (B.nDim() == 1u) {
CHECK_EQ(A.shape().size(), 2u);
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor &CRef = *C;
C->device()->Exec(
[a, A, b, B, CRef, fakeC](Context *ctx) mutable {
GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMV");
});
} else if (B.nDim() == 2u) {
CHECK_EQ(A.shape().size(), 2u);
CHECK(!C->transpose());
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor &CRef = *C;
C->device()->Exec(
[a, A, b, B, CRef, fakeC](Context *ctx) mutable {
GEMM<DType, Lang>(a, A, B, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMM");
});
} else if (B.nDim() == 3u || B.nDim() == 4u) {
CHECK_EQ(A.shape().size(), B.shape().size());
CHECK(!C->transpose());
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor A_tmp;
Tensor B_tmp;
if (A.transpose() || A.broadcasted()) {
A_tmp = Tensor(A.shape(), A.device(), A.data_type());
singa::Transform(A, &A_tmp);
} else {
A_tmp = A;
}
if (B.transpose() || B.broadcasted()) {
B_tmp = Tensor(B.shape(), B.device(), B.data_type());
singa::Transform(B, &B_tmp);
} else {
B_tmp = B;
}
      // batched GEMM requires the operands to have the same batch size
CHECK_EQ(A_tmp.shape(0), B_tmp.shape(0));
if (B.nDim() == 4u) CHECK_EQ(A_tmp.shape(1), B_tmp.shape(1));
Tensor &CRef = *C;
C->device()->Exec(
[a, A_tmp, b, B_tmp, CRef, fakeC](Context *ctx) mutable {
GEMMBatched<DType, Lang>(a, A_tmp, B_tmp, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMMBatched");
});
} else {
LOG(FATAL) << "Un-supported tensor dimentions " << A.nDim() << "d matmul "
<< B.nDim() << "d\n";
}
}
// ************************
// Misc.
// ************************
Tensor CrossEntropyFwd(const Tensor &p, const Tensor &t) {
Tensor loss({p.shape(0)}, p.device(), p.data_type());
ComputeCrossEntropy(p, t, &loss);
return loss;
}
Tensor SoftmaxCrossEntropyBwd(const Tensor &p, const Tensor &t) {
Tensor g = p.Clone();
SoftmaxCrossEntropyBwd(t, &g);
return g;
}
void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
CHECK_LE(p.nDim(), 2u);
CHECK_LE(t.nDim(), 2u);
size_t batchsize = 1;
if (p.nDim() == 2u) batchsize = p.shape(0);
size_t dim = p.Size() / batchsize;
TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
Tensor &lossRef = *loss;
p.device()->Exec(
[batchsize, dim, t, p, lossRef](Context *ctx) mutable {
bool int_target = t.Size() == batchsize;
ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p, t,
&lossRef, ctx);
},
{p.block(), t.block()}, {loss->block()}, "ComputeCrossEntropy");
});
}
void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
CHECK_LE(p->nDim(), 2u);
CHECK_LE(t.nDim(), 2u);
size_t batchsize = 1;
if (p->nDim() == 2u) batchsize = p->shape(0);
size_t dim = p->Size() / batchsize;
TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
Tensor &pRef = *p;
Tensor pFake(*p); // just add a ref count
p->device()->Exec(
[batchsize, dim, t, pRef, pFake, p](Context *ctx) mutable {
bool int_target = t.Size() == batchsize;
SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim, pRef,
t, &pRef, ctx);
},
{p->block(), t.block()}, {p->block()}, "SoftmaxCrossEntropyBackward");
});
}
Tensor &Tensor::Contiguous() {
if (transpose()) {
Tensor t(shape_, device_, data_type_);
singa::Transform(*this, &t);
std::swap(t.block_, block_);
}
return *this;
}
Tensor Contiguous(const Tensor &in) {
Tensor out(in);
return out.Contiguous();
}
// If the tensor is not transposed, we only change the shape and generate a new
// stride; if it is already transposed, we reallocate the memory (making the
// data contiguous) and then generate the stride.
Tensor &Tensor::Reshape(const Shape &shape) {
  // Check that the original volume matches the new one;
  // do not use Product(shape_) due to stride 0 from broadcasting.
CHECK_EQ(Product(shape), Size());
if (transpose()) {
Tensor t(shape_, device_, data_type_);
singa::Transform(*this, &t);
std::swap(t.block_, block_);
shape_ = shape;
} else {
shape_ = shape;
}
generate_stride();
// printf("reshape loc c\n");
return *this;
}
Tensor Reshape(const Tensor &in, const Shape &s) {
// printf("reshape loc a\n");
Tensor out(in);
return out.Reshape(s);
}
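// Illustrative usage (a minimal sketch; names are hypothetical):
//   Tensor a(Shape{2, 3});
//   a.Reshape(Shape{3, 2});      // in-place, metadata-only change
//   Tensor b = Reshape(a, {6});  // returns a reshaped copy sharing the block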
} // namespace singa