| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| #include "singa/core/tensor.h" |
| // #include "singa/utils/stacktrace.h" |
| #include "./tensor_math.h" |
| #include "./tensor_math_cpp.h" |
| #include "./tensor_math_cuda.h" |
| #include "./tensor_math_opencl.h" |
| #include <utility> |
| #include <algorithm> |
| |
| |
| // Sentinel axis value meaning "no axis given" (operate on the flattened tensor); |
| // used by Repeat and RepeatDataToFrom. |
| #define Noaxis 9999 |
| |
| namespace singa { |
| |
| Tensor::~Tensor() { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| block_ = nullptr; |
| } |
| |
| Tensor::Tensor() { |
| device_ = defaultDevice; |
| stride_ = {1}; |
| } |
| |
| // Non-strided constructor (uses the default device). |
| Tensor::Tensor(const Shape &shape, DataType dtype) |
| : data_type_(dtype), device_(defaultDevice), shape_(shape) { |
| size_t size = Product(shape_) * SizeOf(data_type_); |
| if (size) |
| block_ = device_->NewBlock((int)size); |
| generate_stride(); |
| } |
| |
| |
| // Non-strided constructor with an explicit device. |
| Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, |
| DataType dtype) |
| : data_type_(dtype), device_(device), shape_(shape) { |
| size_t size = Product(shape_) * SizeOf(data_type_); |
| if (size) |
| block_ = device_->NewBlock((int)size); |
| generate_stride(); |
| } |
| |
| |
| Tensor::Tensor(const Tensor &in) : data_type_(in.data_type_), |
| device_(in.device_), block_(in.block()), shape_(in.shape_), |
| stride_(in.stride_) { |
| if (block_ != nullptr) |
| block_->IncRefCount(); |
| } |
| |
| |
| Tensor::Tensor(Tensor &&in) : data_type_(in.data_type_), |
| device_(in.device_), shape_(std::move(in.shape_)), |
| stride_(std::move(in.stride_)) { |
| block_ = in.block_; |
| in.block_ = nullptr; |
| } |
| |
| |
| Tensor& Tensor::ResetLike(const Tensor &in) { |
| if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| device_ = in.device_; |
| data_type_ = in.data_type_; |
| block_ = device_->NewBlock((int)in.MemSize()); |
| } |
| shape_ = in.shape_; |
| stride_ = in.stride_; |
| return *this; |
| } |
| |
| Tensor& Tensor::Resize(const Shape& shape) { |
| if (Size() != Product(shape)) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); |
| } |
| shape_ = shape; |
| generate_stride(); |
| return *this; |
| } |
| |
| Tensor Resize(const Tensor& in, const Shape& shape) { |
| Tensor out(in); |
| out.Resize(shape); |
| return out; |
| } |
| |
| |
| Tensor& Tensor::AsType(const DataType type) { |
| if (data_type_ != type) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| block_ = device_->NewBlock((int)(Product(shape_) * SizeOf(type))); |
| data_type_ = type; |
| } |
| return *this; |
| } |
| |
| Tensor& Tensor::ToDevice(std::shared_ptr<Device> dst) { |
| // TODO(wangwei) the comparison is too strict. May compare against the device ID instead? |
| if (device_ != dst) { |
| Tensor tmp(shape_, dst, data_type_); |
| if (block_ != nullptr && Size() && block_->initialized()) |
| tmp.CopyData(*this); |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| block_ = tmp.block_; |
| tmp.block_ = nullptr; |
| device_ = dst; |
| } |
| return *this; |
| } |
| |
| Tensor& Tensor::ToHost() { |
| if (device_ != defaultDevice) ToDevice(device_->host()); |
| return *this; |
| } |
| |
| template <typename DType> |
| void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num, |
| const size_t offset) { |
| CHECK_EQ(sizeof(DType), SizeOf(data_type_)) |
| << "data_type is " << DataType_Name(data_type_) |
| << " user given type is of size " << sizeof(DType); |
| if (src != nullptr) { |
| device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num, |
| sizeof(DType) * offset); |
| } else { |
| LOG(WARNING) << "Copy data from null host ptr"; |
| } |
| } |
| template void Tensor::CopyDataFromHostPtr(const unsigned char *src, |
| const size_t num, |
| const size_t offset); |
| template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num, |
| const size_t offset); |
| template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num, |
| const size_t offset); |
| |
| void Tensor::CopyData(const Tensor &src) { |
| CHECK_EQ(Size(), src.Size()); |
| CHECK(block_ != nullptr); |
| // Do copy only if the src's block is already initialized. |
| if (src.block_ != nullptr) { |
| singa::CopyDataToFrom(this, src, Size(), 0, 0); |
| } |
| } |
| |
| void Tensor::RepeatData(const vector<size_t>& repeats, int axis, int total_repeats, |
| const Tensor &src) { |
| if (repeats.size() == 1) { |
| CHECK_EQ(Size(), src.Size()*total_repeats); |
| } else { |
| CHECK_EQ(Size(), src.Size()*total_repeats / src.shape()[axis]); |
| } |
| |
| CHECK(block_ != nullptr); |
| // Do repeat only if the src's block is already initialized. |
| if (src.block_ != nullptr) { |
| singa::RepeatDataToFrom(false, repeats, axis, this, src, Size()); |
| } |
| } |
| |
| void Tensor::FromProto(const singa::TensorProto &proto) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| block_ = nullptr; |
| shape_.clear(); |
| for (uint32_t s : proto.shape()) shape_.push_back(s); |
| data_type_ = proto.data_type(); |
| block_ = device_->NewBlock((int)(Product(shape()) * SizeOf(data_type_))); |
| //transpose_ = proto.transpose(); |
| stride_.clear(); |
| for (int32_t s : proto.stride()) stride_.push_back(s); |
| switch (data_type_) { |
| case kFloat32: { |
| std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| data_ptr[i] = static_cast<float>(proto.float_data((int)i)); |
| CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_)); |
| break; |
| } |
| case kDouble: { |
| std::unique_ptr<double[]> data(new double[Product(shape_)]); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| data[i] = proto.double_data((int)i); |
| CopyDataFromHostPtr<double>(data.get(), Product(shape_)); |
| break; |
| } |
| case kInt: { |
| std::unique_ptr<int[]> data(new int[Product(shape_)]); |
| for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data((int)i); |
| CopyDataFromHostPtr<int>(data.get(), Product(shape_)); |
| break; |
| } |
| ///TODO(wangji): support the C++ char type via the protobuf bytes type, which |
| /// maps to std::string and therefore needs handling different from the other |
| /// cases. The kChar and kUChar cases below are yet to be implemented. |
| /* |
| case kChar: { |
| std::unique_ptr<char[]> data(new char[Product(shape_)]); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| data[i] = static_cast<char>(proto.bytes_data(i)); |
| break; |
| } |
| case kUChar: { |
| std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| data[i] = static_cast<unsigned char>(proto.bytes_data(i)); |
| break; |
| } |
| */ |
| default: { LOG(FATAL) << "Unsupported data type " << DataType_Name(data_type_); } |
| } |
| } |
| |
| void Tensor::ToProto(singa::TensorProto *proto) const { |
| proto->clear_shape(); |
| for (auto s : shape_) { |
| proto->add_shape(s); |
| } |
| proto->set_data_type(data_type_); |
| //proto->set_transpose(transpose_); |
| proto->clear_stride(); |
| for (auto s : stride_) { |
| proto->add_stride(s); |
| } |
| switch (data_type_) { |
| case kFloat32: { |
| proto->clear_float_data(); |
| const float *data_ptr = data<float>(); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| proto->add_float_data(data_ptr[i]); |
| break; |
| } |
| case kDouble: { |
| proto->clear_double_data(); |
| const double *data_ptr = data<double>(); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| proto->add_double_data(data_ptr[i]); |
| break; |
| } |
| case kInt: { |
| proto->clear_int_data(); |
| const int *data_ptr = data<int>(); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| proto->add_int_data(data_ptr[i]); |
| break; |
| } |
| /* |
| case kChar: { |
| proto->clear_bytes_data(); |
| const char *data = data<char>(); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| proto->add_bytes_data(static_cast<unsigned char>(data[i])); |
| break; |
| } |
| case kUChar: { |
| proto->clear_bytes_data(); |
| const unsigned char *data = data<unsigned char>(); |
| for (size_t i = 0; i < Product(shape_); ++i) |
| proto->add_bytes_data(static_cast<unsigned char>(data[i])); |
| break; |
| } |
| */ |
| default: { LOG(FATAL) << "Unsupported data type " << DataType_Name(data_type_); } |
| } |
| } |
| |
| Tensor Tensor::Repeat(const vector<size_t>& repeats, int axis, |
| std::shared_ptr<Device> device) { |
| if (device == nullptr) device = device_; |
| vector<size_t> tshape; |
| int total_repeats = 0; |
| if (axis == Noaxis) { |
| total_repeats = repeats[0]; |
| tshape.push_back(Product(shape_)*total_repeats); |
| } else { |
| if (repeats.size() == 1) { |
| total_repeats = repeats[0]; |
| for (int i = 0; i < static_cast<int>(shape_.size()); i++) { |
| if (i == axis) { |
| tshape.push_back(shape_[i] * total_repeats); |
| } else { |
| tshape.push_back(shape_[i]); |
| } |
| } |
| } else { |
| if (repeats.size() != shape_[axis]) { |
| LOG(FATAL) << "the repeats number doesn't match the axis"; |
| } |
| for (size_t i = 0; i < shape_[axis]; i++) { |
| if (repeats[i] < 0) { |
| LOG(FATAL) << "the repeats number is less than zero"; |
| } |
| total_repeats += repeats[i]; |
| } |
| for (int i = 0; i < static_cast<int>(shape_.size()); i++) { |
| if (i == axis) { |
| tshape.push_back(total_repeats); |
| } else { |
| tshape.push_back(shape_[i]); |
| } |
| } |
| } |
| } |
| Tensor t(tshape, device); |
| //t.stride_.push_back(1); |
| t.RepeatData(repeats, axis, total_repeats, *this); |
| return t; |
| } |
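| |
| // Illustrative usage (a sketch, not part of the original source, assuming the |
| // constructor defaults declared in tensor.h): Repeat behaves like numpy.repeat. |
| // With a single repeat count and axis 0, every row is duplicated: |
| //   Tensor x(Shape{2, 3}); |
| //   x.SetValue(1.0f); |
| //   Tensor y = x.Repeat({2}, 0, nullptr);   // y.shape() == {4, 3} |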
| |
| Tensor Tensor::Clone(std::shared_ptr<Device> device) const { |
| if (device == nullptr) device = device_; |
| Tensor t(shape_, device, data_type_); |
| //t.transpose_ = transpose_; |
| t.stride_ = stride_; |
| t.CopyData(*this); |
| return t; |
| } |
| |
| Tensor& Tensor::Broadcast(const Shape& shape) { |
| // TODO(wangwei) do we need to transform the mem layout if the tensor was |
| // transposed? |
| auto m = shape_.size() - 1, n = shape.size() - 1; |
| for (size_t i = 0; i <= std::min(m, n); i++) { |
| if ((shape.at(n - i) != shape_.at(m - i)) && (shape.at(n - i) != 1)) { |
| CHECK_EQ(shape_.at(m - i), 1) << "i= " << i << "\n"; // << Backtrace(); |
| shape_.at(m - i) = shape.at(n - i); |
| stride_.at(m - i) = 0; |
| } |
| } |
| if (m < n) { |
| for (size_t i = m + 1; i <= n; i++) { |
| shape_.emplace(shape_.begin(), shape.at(n - i)); |
| stride_.emplace(stride_.begin(), 0); |
| } |
| } |
| return *this; |
| } |
| |
| Tensor Broadcast(const Tensor& in, const Shape& shape) { |
| Tensor out(in); |
| return out.Broadcast(shape); |
| } |
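| |
| // Illustrative usage (a sketch, assuming the constructor defaults in tensor.h): |
| // broadcasting never copies data; the broadcast axes simply get stride 0, so the |
| // output is a view over the same block: |
| //   Tensor a(Shape{3, 1}); |
| //   Tensor b = Broadcast(a, Shape{3, 4});   // b.shape() == {3, 4}; the stride of |
| //                                           // the last axis of b is 0 |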
| |
| Tensor& Tensor::T() { |
| // this function only works for 2d tensors |
| CHECK_EQ(shape_.size(), 2u); |
| Transpose(); |
| return *this; |
| } |
| |
| //normal transpose without axes |
| Tensor& Tensor::Transpose() { |
| std::reverse(shape_.begin(), shape_.end()); |
| std::reverse(stride_.begin(), stride_.end()); |
| return *this; |
| } |
| |
| //transpose with axes |
| Tensor& Tensor::Transpose(const vector<size_t> &axes) { |
| CHECK_EQ(axes.size(), shape_.size()) << |
| "Transpose axes' length should equal the number of tensor dimensions"; |
| |
| auto shape = shape_; |
| auto stride = stride_; |
| shape_.clear(); |
| stride_.clear(); |
| for (size_t n = 0; n < axes.size(); ++n) { |
| shape_.push_back(shape[axes[n]]); |
| stride_.push_back(stride[axes[n]]); |
| } |
| return *this; |
| } |
| |
| //normal transpose without axes |
| Tensor Transpose(const Tensor& in) { |
| Tensor out(in); |
| out.Transpose(); |
| return out; |
| } |
| |
| //transpose with axes |
| Tensor Transpose(const Tensor& in, const vector<size_t> &axes) { |
| Tensor out(in); |
| out.Transpose(axes); |
| return out; |
| } |
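| |
| // Illustrative usage (a sketch): Transpose only permutes shape_ and stride_, |
| // producing a strided view that shares the input's block; no data is moved |
| // until an operation materialises the new layout (see Reshape below): |
| //   Tensor m(Shape{2, 3, 4}); |
| //   Tensor t1 = Transpose(m);             // shape {4, 3, 2} |
| //   Tensor t2 = Transpose(m, {2, 0, 1});  // shape {4, 2, 3} |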
| |
| Tensor &Tensor::operator=(const Tensor &in) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| stride_ = in.stride_; |
| data_type_ = in.data_type_; |
| shape_ = in.shape_; |
| device_ = in.device_; |
| block_ = in.block(); |
| if (block_ != nullptr) |
| block_->IncRefCount(); |
| return *this; |
| } |
| |
| Tensor &Tensor::operator=(Tensor &&in) { |
| if (block_ != nullptr && block_->DecRefCount() == 0) |
| device_->FreeBlock(block_); |
| stride_ = std::move(in.stride_); |
| data_type_ = in.data_type_; |
| shape_ = std::move(in.shape_); |
| device_ = in.device_; |
| block_ = in.block_; |
| in.block_ = nullptr; |
| return *this; |
| } |
| |
| |
| #define GenUnaryTensorArgMemberFn(op, fn) \ |
| Tensor &Tensor::op(const Tensor &in) { \ |
| fn(*this, in, this); \ |
| return *this; \ |
| } |
| |
| GenUnaryTensorArgMemberFn(operator+=, Add); |
| GenUnaryTensorArgMemberFn(operator-=, Sub); |
| GenUnaryTensorArgMemberFn(operator*=, EltwiseMult); |
| GenUnaryTensorArgMemberFn(operator/=, Div); |
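| |
| // For reference, GenUnaryTensorArgMemberFn(operator+=, Add) expands to: |
| //   Tensor &Tensor::operator+=(const Tensor &in) { Add(*this, in, this); return *this; } |
| // so `a += b;` performs an in-place element-wise Add on tensor a. |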
| |
| #define GenUnaryScalarArgMemberFn(op, fn) \ |
| template <typename DType> \ |
| Tensor &Tensor::op(const DType x) { \ |
| fn(*this, x, this); \ |
| return *this; \ |
| } \ |
| template Tensor &Tensor::op<float>(const float x) |
| |
| GenUnaryScalarArgMemberFn(operator-=, Sub); |
| GenUnaryScalarArgMemberFn(operator+=, Add); |
| GenUnaryScalarArgMemberFn(operator*=, EltwiseMult); |
| GenUnaryScalarArgMemberFn(operator/=, Div); |
| |
| // ====================Tensor Operations======================================= |
| void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num, |
| const size_t dst_offset, const size_t src_offset) { |
| auto width = SizeOf(src.data_type()); |
| CHECK_EQ(width, SizeOf(dst->data_type())); |
| size_t nBytes = num * width; |
| auto d_offset = dst_offset * width; |
| auto s_offset = src_offset * width; |
| CHECK_GE(src.MemSize(), s_offset + nBytes); |
| CHECK_GE(dst->MemSize(), d_offset + nBytes); |
| |
| std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device(); |
| Block *from = src.block(), *to = dst->block(); |
| if (dst_dev->lang() != src_dev->lang()) { |
| // let the non-cpp device conduct the copy op |
| if (dst_dev->lang() == kCpp) { |
| src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, (int)d_offset, |
| (int)s_offset); |
| } else if (src_dev->lang() == kCpp) { |
| dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, (int)d_offset, |
| (int)s_offset); |
| } else { |
| LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device"; |
| } |
| } else { |
| auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice; |
| src_dev->CopyDataToFrom(to, from, nBytes, direct, (int)d_offset, (int)s_offset); |
| } |
| } |
| |
| void RepeatDataToFrom(bool broadcast_flag, const vector<size_t>& repeats, int axis, |
| Tensor *dst, const Tensor &src, const size_t num) { |
| if (repeats.size() == 1) { |
| broadcast_flag = true; |
| } else if (repeats.size() > 1) { |
| if (axis == Noaxis) { |
| LOG(FATAL) << "When repeats parameter is sequence, axis cannot be None"; |
| } |
| } |
| for (size_t i = 0; i < repeats.size(); i++) { |
| CHECK_GE(repeats[i], 0); |
| } |
| auto width = SizeOf(src.data_type()); |
| CHECK_EQ(width, SizeOf(dst->data_type())); |
| // size_t nBytes = num * width; |
| int chunk = width; |
| int axis_shape = 1; |
| int shape_outer = 1; |
| if (axis == Noaxis) { |
| axis_shape = 1; |
| shape_outer = Product(src.shape()); |
| } else { |
| for (int i = 0; i < axis; i++) { |
| shape_outer *= src.shape()[i]; |
| } |
| axis_shape = src.shape()[axis]; |
| for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) { |
| chunk *= src.shape()[i]; |
| } |
| } |
| int dst_offset = 0; |
| int src_offset = 0; |
| std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device(); |
| Block *from = src.block(), *to = dst->block(); |
| for (int i = 0; i < shape_outer; i++) { |
| for (int j = 0; j < axis_shape; j++) { |
| int temp = broadcast_flag ? repeats[0] : repeats[j]; |
| for (int k = 0; k < temp; k++) { |
| if (dst_dev->lang() != src_dev->lang()) { |
| // let the non-cpp device conduct the copy op |
| if (dst_dev->lang() == kCpp) { |
| src_dev->CopyDataToFrom(to, from, chunk, kDeviceToHost, dst_offset, src_offset); |
| } else if (src_dev->lang() == kCpp) { |
| dst_dev->CopyDataToFrom(to, from, chunk, kHostToDevice, dst_offset, src_offset); |
| } else { |
| LOG(FATAL) << "Not support mem repeat copy betwee Cuda and OpenCL device"; |
| } |
| } else { |
| auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice; |
| src_dev->CopyDataToFrom(to, from, chunk, direct, dst_offset, src_offset); |
| } |
| dst_offset += chunk; |
| } |
| src_offset += chunk; |
| } |
| } |
| } |
| |
| //============================================================================ |
| /// typedef DType according to the type value. |
| /// DType would be used in the code block __VA_ARGS__. |
| #define TYPE_SWITCH(type, DType, ...) \ |
| do { \ |
| switch (type) { \ |
| case kFloat32: { \ |
| typedef float DType; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| case kInt: { \ |
| typedef int DType; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| case kChar: { \ |
| typedef char DType; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| case kDouble: { \ |
| typedef double DType; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| default: \ |
| LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \ |
| } \ |
| } while (0) |
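| |
| // Example usage (taken from DivColumn below): dispatch on the runtime DataType, |
| // typedef-ing DType to the matching C++ type inside the block: |
| //   TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); |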
| |
| /// typedef DType and Lang according to data type and device programming |
| /// language respectively. |
| /// type is from DataType, and lang is from LangType. |
| /// DType and Lang would be used in __VA_ARGS__. |
| #define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \ |
| do { \ |
| const int _SwitchShift = 3; \ |
| int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \ |
| switch (_SwitchHash) { \ |
| case ((kFloat32 << _SwitchShift) + kCuda): { \ |
| typedef float DType; \ |
| typedef lang::Cuda Lang; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| case ((kFloat32 << _SwitchShift) + kCpp): { \ |
| typedef float DType; \ |
| typedef lang::Cpp Lang; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| case ((kFloat32 << _SwitchShift) + kOpencl): { \ |
| typedef float DType; \ |
| typedef lang::Opencl Lang; \ |
| { __VA_ARGS__ } \ |
| break; \ |
| } \ |
| default: \ |
| LOG(FATAL) << "Unknown combination of data type " \ |
| << DataType_Name(dtype) << " and language " \ |
| << LangType_Name(ltype); \ |
| } \ |
| } while (0) |
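| |
| // Example usage (a condensed sketch of Tensor::L1 below): dispatch on both the |
| // element type and the device language so the lambda can call the templated |
| // backend kernel for the right <DType, Lang> pair: |
| //   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { |
| //     device_->Exec([&](Context *ctx) { Asum<DType, Lang>(*this, &ret, ctx); }, |
| //                   {this->block()}, {}); |
| //   }); |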
| |
| // =============Element-wise operations==================================== |
| float Tensor::L1() const { |
| float nrm = 0.0f; |
| TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { |
| device_->Exec([&nrm, this](Context * ctx) { |
| DType ret = DType(0); |
| Asum<DType, Lang>(*this, &ret, ctx); |
| nrm = TypeCast<DType, float>(ret); |
| }, {this->block()}, {}); |
| }); |
| return nrm / Size(); |
| } |
| |
| /// L2 norm. Named L2 instead of Nrm2 to avoid a name conflict with the backend kernel. |
| float Tensor::L2() const { |
| float nrm = 0.0f; |
| TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { |
| device_->Exec([&nrm, this](Context * ctx) { |
| DType ret = DType(0); |
| Nrm2<DType, Lang>(*this, &ret, ctx); |
| nrm = TypeCast<DType, float>(ret); |
| }, {this->block()}, {}); |
| }); |
| return nrm / Size(); |
| } |
| |
| template <typename SType> |
| void Tensor::SetValue(const SType x) { |
| CHECK_EQ(sizeof(SType), SizeOf(data_type_)); |
| //auto size = Size(); |
| auto ptr = block_; |
| |
| TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { |
| // TODO(wangwei) cast x to DType |
| device_->Exec([this, x, ptr](Context * ctx) { |
| Set<DType, Lang>(x, this, ctx); |
| }, {}, {ptr}); |
| }); |
| } |
| template void Tensor::SetValue<float>(const float x); |
| template void Tensor::SetValue<int>(const int x); |
| |
| #define EltwiseUnaryTensorFn(fn, t, ret) \ |
| do { \ |
| TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ |
| ret->device()->Exec([t, ret](Context * ctx) { \ |
| fn<DType, Lang>(t, ret, ctx); \ |
| }, {t.block()}, {ret->block()}); \ |
| }); \ |
| } while (0) |
| |
| #define GenUnaryTensorFn(fn) \ |
| Tensor fn(const Tensor &in) { \ |
| Tensor ret(in.shape(), in.device(), in.data_type()); \ |
| auto *retptr = &ret; \ |
| EltwiseUnaryTensorFn(fn, in, retptr); \ |
| return ret; \ |
| } \ |
| void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); } |
| |
| GenUnaryTensorFn(Abs); |
| GenUnaryTensorFn(Exp); |
| GenUnaryTensorFn(Log); |
| GenUnaryTensorFn(ReLU); |
| GenUnaryTensorFn(Sigmoid); |
| GenUnaryTensorFn(Sign); |
| GenUnaryTensorFn(Sqrt); |
| GenUnaryTensorFn(Square); |
| GenUnaryTensorFn(Tanh); |
| GenUnaryTensorFn(Transform); |
| |
| #define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \ |
| do { \ |
| TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \ |
| CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \ |
| ret->device()->Exec([lhs, rhs, ret](Context * ctx) { \ |
| fn<DType, Lang>(lhs, rhs, ret, \ |
| ctx); \ |
| }, {lhs.block(), rhs.block()}, {ret->block()}); \ |
| }); \ |
| } while (0) |
| |
| #define GenBinaryTensorFn(op, fn) \ |
| Tensor op(const Tensor &lhs, const Tensor &rhs) { \ |
| if (lhs.shape() != rhs.shape()) { \ |
| auto lhs_ = Broadcast(lhs, rhs.shape()); \ |
| auto rhs_ = Broadcast(rhs, lhs.shape()); \ |
| Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type()); \ |
| fn(lhs_, rhs_, &ret); \ |
| return ret; \ |
| } else { \ |
| Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \ |
| fn(lhs, rhs, &ret); \ |
| return ret; \ |
| } \ |
| } \ |
| void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \ |
| CHECK_EQ(lhs.device(), ret->device()); \ |
| CHECK_EQ(rhs.device(), ret->device()); \ |
| if (lhs.shape() != rhs.shape()) { \ |
| auto lhs_ = Broadcast(lhs, rhs.shape()); \ |
| auto rhs_ = Broadcast(rhs, lhs.shape()); \ |
| CHECK(lhs_.shape() == ret->shape()); \ |
| EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret); \ |
| } else { \ |
| CHECK(lhs.shape() == ret->shape()); \ |
| EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \ |
| } \ |
| } |
| |
| // broadcasting operations: https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md |
| GenBinaryTensorFn(operator+, Add); |
| GenBinaryTensorFn(operator-, Sub); |
| GenBinaryTensorFn(operator*, EltwiseMult); |
| GenBinaryTensorFn(operator/, Div); |
| GenBinaryTensorFn(Pow, Pow); |
| GenBinaryTensorFn(operator<, LT); |
| GenBinaryTensorFn(operator<=, LE); |
| GenBinaryTensorFn(operator>, GT); |
| GenBinaryTensorFn(operator>=, GE); |
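| |
| // Illustrative usage (a sketch, assuming the constructor defaults in tensor.h): |
| // the binary operators broadcast mismatched shapes per the ONNX/numpy rules |
| // linked above, e.g. adding a (2, 3) tensor and a (1, 3) tensor gives (2, 3): |
| //   Tensor a(Shape{2, 3}), b(Shape{1, 3}); |
| //   a.SetValue(1.0f); b.SetValue(2.0f); |
| //   Tensor c = a + b;   // c.shape() == {2, 3}, every element equals 3 |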
| |
| #define EltwiseTensorScalarFn(fn, t, x, ret) \ |
| do { \ |
| TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ |
| static_assert(std::is_same<SType, DType>::value, \ |
| "The Scalar type must match the Tensor data type"); \ |
| ret->device()->Exec([t, x, ret](Context * ctx) { \ |
| fn<DType, Lang>(t, x, ret, ctx); \ |
| }, {t.block()}, {ret->block()}); \ |
| }); \ |
| } while (0) |
| |
| #define GenTensorScalarFn(op, fn) \ |
| template <typename SType> \ |
| Tensor op(const Tensor &in, const SType x) { \ |
| Tensor ret(in.shape(), in.device(), in.data_type()); \ |
| fn(in, x, &ret); \ |
| return ret; \ |
| } \ |
| template <typename SType> \ |
| void fn(const Tensor &in, const SType x, Tensor *ret) { \ |
| EltwiseTensorScalarFn(fn, in, x, ret); \ |
| } \ |
| template Tensor op <float>(const Tensor &in, const float x); \ |
| template void fn<float>(const Tensor &in, const float x, Tensor *ret) |
| |
| GenTensorScalarFn(operator+, Add); |
| GenTensorScalarFn(operator-, Sub); |
| GenTensorScalarFn(operator*, EltwiseMult); |
| GenTensorScalarFn(operator/, Div); |
| GenTensorScalarFn(Pow, Pow); |
| GenTensorScalarFn(operator<, LT); |
| GenTensorScalarFn(operator<=, LE); |
| GenTensorScalarFn(operator>, GT); |
| GenTensorScalarFn(operator>=, GE); |
| template <typename SType> |
| Tensor Div(const SType alpha, const Tensor &in) { |
| Tensor out(in.shape(), in.device(), in.data_type()); |
| Div(alpha, in, &out); |
| return out; |
| } |
| template Tensor Div<float>(const float, const Tensor &); |
| |
| template <typename SType> |
| void Div(const SType alpha, const Tensor &in, Tensor *out) { |
| CheckDataTypeAndLang(in, *out); |
| CHECK(in.shape() == out->shape()); |
| TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { |
| // TODO(wangwei) type cast SType to DType; |
| in.device()->Exec([alpha, in, out](Context * ctx) { |
| Div<DType, Lang>(alpha, in, out, ctx); |
| }, {in.block()}, {out->block()}); |
| }); |
| } |
| template void Div<float>(const float, const Tensor &, Tensor *); |
| |
| // =============Matrix operations============================================ |
| Tensor Average(const Tensor &M, int axis) { |
| // operator/ is only implemented for the float scalar type, hence the |
| // denominator must be cast to a float. |
| // TODO(wangwei) implement a function for casting the scalar types involved in |
| // Tensor functions. E.g., |
| // template <typename S, typename D> |
| // D CastTo(S x) { |
| // return D(x); |
| // } |
| // with specializations for special types, e.g., fp16: |
| // template <> |
| // fp16 CastTo(float x) { |
| // .... |
| // } |
| if (axis == 0) { |
| return Sum(M, 0) / (1.0f * M.shape(0)); |
| } else { |
| CHECK_EQ(axis, 1); |
| return Sum(M, 1) / (1.0f * M.shape(1)); |
| } |
| } |
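| |
| // Illustrative usage (a sketch): Average(M, 0) sums over rows (via SumRows) and |
| // divides by the number of rows, giving the per-column mean; Average(M, 1) |
| // gives the per-row mean: |
| //   Tensor M(Shape{2, 3}); |
| //   M.SetValue(4.0f); |
| //   Tensor col_mean = Average(M, 0);   // shape {3}, every entry equals 4 |
| |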
| // TODO(wangwei) consider async exec |
| template <> |
| float Sum<float>(const Tensor &in) { |
| float s = 0.0f; |
| Tensor one(in.shape(), in.device(), in.data_type()); |
| one.SetValue(1.0f); |
| TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { |
| one.device()->Exec([in, one, &s](Context * ctx) { |
| DType ret = DType(0); |
| Dot<DType, Lang>(in, one, &ret, ctx); |
| s = ret; |
| }, {in.block(), one.block()}, {}); |
| }); |
| return s; |
| } |
| |
| Tensor Sum(const Tensor &M, int axis) { |
| if (axis == 0) { |
| Tensor out(Shape{M.shape(1)}, M.device(), M.data_type()); |
| SumRows(M, &out); |
| return out; |
| } else { |
| CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis; |
| Tensor out(Shape{M.shape(0)}, M.device(), M.data_type()); |
| SumColumns(M, &out); |
| return out; |
| } |
| } |
| |
| Tensor SoftMax(const Tensor &in) { |
| Tensor out(in.shape(), in.device(), in.data_type()); |
| SoftMax(in, &out); |
| return out; |
| } |
| |
| Tensor RowMax(const Tensor &in) { |
| Tensor ret({in.shape(0)}, in.device(), in.data_type()); |
| TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { |
| in.device()->Exec([&in, &ret](Context * ctx) { |
| //size_t nrow = 1; |
| //if (in.nDim() > 1) nrow = in.shape(0); |
| //size_t ncol = in.Size() / nrow; |
| RowMax<DType, Lang>(in, &ret, ctx); |
| }, {in.block()}, {ret.block()}); |
| }); |
| return ret; |
| } |
| |
| void SoftMax(const Tensor &in, Tensor *out) { |
| CHECK_LE(in.nDim(), 2u); |
| out->CopyData(in); |
| size_t nrow = 1, ncol = in.Size(), size = ncol; |
| if (in.nDim() == 2u) { |
| nrow = in.shape(0); |
| ncol = size / nrow; |
| out->Reshape(Shape{nrow, ncol}); |
| } |
| Tensor tmp = RowMax(*out); |
| SubColumn(tmp, out); |
| Exp(*out, out); |
| |
| SumColumns(*out, &tmp); |
| DivColumn(tmp, out); |
| out->Reshape(in.shape()); |
| } |
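| |
| // Illustrative usage (a sketch): row-wise softmax of a 2-d tensor; as shown |
| // above, the row max is subtracted first for numerical stability, then exp is |
| // applied and each row is normalised by its sum: |
| //   Tensor logits(Shape{2, 3}); |
| //   Uniform(-1.0f, 1.0f, &logits); |
| //   Tensor probs = SoftMax(logits);   // each row of probs sums to 1 |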
| |
| void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); } |
| /// Add column vector 'v' to each column of matrix M, i.e., M = alpha * v * ones + beta * M. |
| template <typename SType> |
| void AddColumn(const SType alpha, const SType beta, const Tensor &v, |
| Tensor *M) { |
| if (M->transpose()) { |
| Tensor X = Transpose(*M); |
| AddRow(v, &X); |
| } else { |
| CHECK_EQ(M->nDim(), 2u); |
| // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple |
| size_t nb_row = M->shape(0), nb_col = M->shape(1); |
| CHECK_EQ(nb_row, v.Size()); |
| |
| Tensor one(Shape{1, nb_col}, M->device(), M->data_type()); |
| one.SetValue(1.0f); // TODO(wangwei) cast type |
| Tensor vmat = Reshape(v, Shape{nb_row, 1}); |
| Mult(alpha, vmat, one, beta, M); |
| } |
| } |
| template |
| void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M); |
| |
| void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); } |
| |
| /// Add row vector 'v' to each row of matrix M, i.e., M = alpha * ones * v + beta * M. |
| template <typename SType> |
| void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) { |
| if (M->transpose()) { |
| Tensor X = Transpose(*M); |
| AddColumn(v, &X); |
| } else { |
| CHECK_EQ(M->nDim(), 2u); |
| // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple |
| size_t nb_row = M->shape(0), nb_col = M->shape(1); |
| CHECK_EQ(nb_col, v.Size()); |
| |
| Tensor one(Shape{nb_row, 1}, M->device(), M->data_type()); |
| one.SetValue(1.0f); |
| Tensor vmat = Reshape(v, Shape{1, nb_col}); |
| Mult(alpha, one, vmat, beta, M); |
| } |
| } |
| template void AddRow(const float alpha, const float beta, const Tensor &v, |
| Tensor *M); |
| |
| /// Divide each column of matrix M element-wise by column vector 'v'; results are written into M. |
| void DivColumn(const Tensor &v, Tensor *M) { |
| Tensor inv; |
| TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); |
| MultColumn(inv, M); |
| } |
| |
| Tensor ConcatOn(const std::vector<Tensor> &in, int axis) { |
| vector<Tensor> tmp; |
| Shape out_shape = in[0].shape(); |
| size_t dim = in[0].shape().size(); |
| CHECK_GE(dim, 2u) << "Only works for tensors of dim >= 2"; |
| size_t size = in[0].Size() / in[0].shape(axis); |
| size_t new_size = 0u; |
| for (const auto& t : in) { |
| CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim"; |
| CHECK_EQ(size, t.Size() / t.shape(axis)) << "All axes should have the same " |
| << "size except the concatenated axis"; |
| new_size += t.shape(axis); |
| } |
| out_shape[axis] = new_size; |
| if (axis == 0) { |
| size_t nrow = 0; |
| for (const auto& t : in) { |
| nrow += t.shape(0); |
| tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)})); |
| } |
| auto ret = ConcatenateRows(tmp); |
| ret.Reshape(out_shape); |
| return ret; |
| } else { |
| for (const auto& t : in) { |
| size_t nrow = 1; |
| for (int i = 0; i < axis; i++) |
| nrow *= t.shape(i); |
| tmp.push_back(Reshape(t, {nrow, t.Size() / nrow})); |
| } |
| auto ret = ConcatenateColumns(tmp); |
| ret.Reshape(out_shape); |
| return ret; |
| } |
| } |
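| |
| // Illustrative usage (a sketch): concatenating two (2, 3) tensors along axis 0 |
| // gives a (4, 3) tensor; along axis 1 it gives (2, 6): |
| //   Tensor a(Shape{2, 3}), b(Shape{2, 3}); |
| //   a.SetValue(1.0f); b.SetValue(2.0f); |
| //   Tensor c = ConcatOn({a, b}, 0);   // c.shape() == {4, 3} |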
| |
| Tensor ConcatenateRows(const vector<Tensor> &in) { |
| size_t nrow = 0, ncol = 0; |
| CHECK(in.size()); |
| for (const auto &x : in) { |
| CHECK(!x.transpose()); |
| CHECK_EQ(x.nDim(), 2u); |
| nrow += x.shape(0); |
| if (ncol == 0) |
| ncol = x.shape(1); |
| else |
| CHECK_EQ(ncol, x.shape(1)); |
| } |
| Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type()); |
| size_t dst_offset = 0; |
| for (const auto &x : in) { |
| CopyDataToFrom(&out, x, x.Size(), dst_offset, 0); |
| dst_offset += x.Size(); |
| } |
| return out; |
| } |
| Tensor ConcatRows(const vector<Tensor> &in) { |
| return ConcatenateRows(in); |
| } |
| // TODO(wangwei) add a copy-patch function to improve efficiency on GPU. |
| Tensor ConcatenateColumns(const vector<Tensor> &in) { |
| size_t nrow = 0, ncol = 0; |
| CHECK(in.size()); |
| for (const auto &x : in) { |
| CHECK(!x.transpose()); |
| CHECK_EQ(x.nDim(), 2u); |
| ncol += x.shape(1); |
| if (nrow == 0) |
| nrow = x.shape(0); |
| else |
| CHECK_EQ(nrow, x.shape(0)); |
| } |
| Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type()); |
| for (size_t row = 0; row < nrow; row++) { |
| size_t dst_offset = row * ncol; |
| for (const auto &x : in) { |
| size_t src_offset = row * x.shape(1); |
| CopyDataToFrom(&out, x, x.shape(1), dst_offset, src_offset); |
| dst_offset += x.shape(1); |
| } |
| CHECK_EQ(dst_offset, row * ncol + ncol); |
| } |
| return out; |
| } |
| Tensor ConcatColumns(const vector<Tensor> &in) { |
| return ConcatenateColumns(in); |
| } |
| |
| Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { |
| CHECK_LT(start, end); |
| CHECK_GE(in.shape(0), end) << "end must not exceed the number of rows"; |
| Shape s = in.shape(); |
| s[0] = end - start; |
| size_t sample_size = in.Size() / in.shape(0); |
| Tensor out(s, in.device(), in.data_type()); |
| CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size); |
| return out; |
| } |
| |
| |
| Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, |
| int axis) { |
| Shape out_shape = in.shape(); |
| out_shape[axis] = end - start; |
| if (axis == 0) { |
| auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}), |
| start, end); |
| ret.Reshape(out_shape); |
| return ret; |
| } else { |
| size_t nrow = 1; |
| for (int i = 0; i < axis; i++) |
| nrow *= in.shape(i); |
| auto suffix = in.Size() / nrow / in.shape(axis); |
| auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}), |
| start * suffix, end * suffix); |
| ret.Reshape(out_shape); |
| return ret; |
| } |
| } |
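| |
| // Illustrative usage (a sketch): SliceOn takes the half-open range [start, end) |
| // along the given axis: |
| //   Tensor x(Shape{4, 5}); |
| //   x.SetValue(1.0f); |
| //   Tensor y = SliceOn(x, 1, 3, 0);   // rows 1 and 2; y.shape() == {2, 5} |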
| |
| Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) { |
| return CopyRows(in, start, end); |
| } |
| |
| Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) { |
| CHECK_EQ(in.nDim(), 2u); |
| CHECK_LT(start, end); |
| CHECK_GE(in.shape(1), end); |
| Shape s{in.shape(0), end - start}; |
| Tensor out(s, in.device(), in.data_type()); |
| for (size_t row = 0; row < out.shape(0); row++) { |
| size_t src_offset = row * in.shape(1) + start; |
| size_t dst_offset = row * out.shape(1); |
| CopyDataToFrom(&out, in, end - start, dst_offset, src_offset); |
| } |
| return out; |
| } |
| |
| Tensor SliceColumns(const Tensor &in, const size_t start, const size_t end) { |
| return CopyColumns(in, start, end); |
| } |
| |
| |
| /// Divide each row of matrix M element-wise by row vector 'v'; results are written into M. |
| void DivRow(const Tensor &v, Tensor *M) { |
| Tensor inv; |
| TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); |
| MultRow(inv, M); |
| } |
| |
| /// Multiply each column of matrix M element-wise by column vector 'v'; results are written into M. |
| void MultColumn(const Tensor &v, Tensor *M) { |
| //CHECK(!M->transpose()) << "Not supported yet"; |
| CHECK_EQ(M->nDim(), 2u); |
| // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple |
| CHECK_EQ(v.Size(), M->shape(0)); |
| CheckDataTypeAndLang(*M, v); |
| TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { |
| v.device()->Exec([M, v](Context * ctx) { |
| DGMM<DType, Lang>(false, *M, v, |
| M, ctx); |
| }, {M->block(), v.block()}, {M->block()}); |
| }); |
| } |
| |
| /// Multiply each row of matrix M element-wise by row vector 'v'; results are written into M. |
| void MultRow(const Tensor &v, Tensor *M) { |
| //CHECK(!M->transpose()) << "Not supported yet"; |
| CHECK_EQ(M->nDim(), 2u); |
| // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple |
| CHECK_EQ(v.Size(), M->shape(1)); |
| CheckDataTypeAndLang(*M, v); |
| TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { |
| v.device()->Exec([M, v](Context * ctx) { |
| DGMM<DType, Lang>(true, *M, v, |
| M, ctx); |
| }, {M->block(), v.block()}, {M->block()}); |
| }); |
| } |
| |
| void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); } |
| |
| void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } |
| |
| void SumColumns(const Tensor &M, Tensor *v) { |
| if (M.transpose()) { |
| Tensor X = Transpose(M); |
| SumRows(X, v); |
| } else { |
| CHECK_EQ(M.nDim(), 2u); |
| // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple |
| size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1); |
| CHECK_EQ(nb_row, v->Size()); |
| |
| Tensor one(Shape{nb_col}, M.device(), M.data_type()); |
| one.SetValue(1.0f); // TODO(wangwei) cast type |
| Mult(M, one, v); |
| } |
| } |
| void SumRows(const Tensor &M, Tensor *v) { |
| if (M.transpose()) { |
| Tensor X = Transpose(M); |
| SumColumns(X, v); |
| } else { |
| CHECK_EQ(M.nDim(), 2u); |
| // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple |
| size_t nb_row = M.shape(0), nb_col = M.shape(1); |
| CHECK_EQ(nb_col, v->Size()); |
| |
| Tensor one(Shape{nb_row}, M.device(), M.data_type()); |
| one.SetValue(1.0f); // TODO(wangwei) cast type |
| Tensor X = Transpose(M); |
| Mult(X, one, v); |
| } |
| } |
| // ====================Random operations===================================== |
| template <typename SType> |
| void Bernoulli(const SType p, Tensor *out) { |
| TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { |
| auto prob = TypeCast<SType, DType>(p); |
| out->device()->Exec([prob, out](Context * ctx) { |
| Bernoulli<DType, Lang>(prob, out, ctx); |
| }, {}, {out->block()}, true); |
| }); |
| } |
| |
| template void Bernoulli<float>(const float p, Tensor *out); |
| |
| template <typename SType> |
| void Uniform(const SType low, const SType high, Tensor *out) { |
| TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { |
| auto l = TypeCast<SType, DType>(low); |
| auto h = TypeCast<SType, DType>(high); |
| out->device()->Exec([l, h, out](Context * ctx) { |
| Uniform<DType, Lang>(l, h, out, ctx); |
| }, {}, {out->block()}, true); |
| }); |
| } |
| |
| template void Uniform<float>(const float low, const float high, Tensor *out); |
| |
| template <typename SType> |
| void Gaussian(const SType mean, const SType std, Tensor *out) { |
| TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { |
| auto m = TypeCast<SType, DType>(mean); |
| auto s = TypeCast<SType, DType>(std); |
| out->device()->Exec([m, s, out](Context * ctx) { |
| Gaussian<DType, Lang>(m, s, out, ctx); |
| }, {}, {out->block()}, true); |
| }); |
| } |
| template void Gaussian<float>(const float mean, const float std, Tensor *out); |
| |
| // ================Blas operations============================================ |
| |
| template <typename SType> |
| void Axpy(const SType alpha, const Tensor &in, Tensor *out) { |
| TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { |
| auto a = TypeCast<SType, DType>(alpha); |
| out->device()->Exec([a, in, out](Context * ctx) { |
| Axpy<DType, Lang>(a, in, out, ctx); |
| }, {in.block(), out->block()}, {out->block()}); |
| }); |
| } |
| |
| template |
| void Axpy<float>(const float alpha, const Tensor &in, Tensor *out); |
| |
| Tensor Mult(const Tensor &A, const Tensor &B) { |
| Shape s; |
| s.push_back(A.shape(0)); |
| if (B.nDim() == 2) s.push_back(B.shape(1)); |
| Tensor out(s, A.device(), A.data_type()); |
| Mult(A, B, &out); |
| return out; |
| } |
| |
| void Mult(const Tensor &A, const Tensor &B, Tensor *out) { |
| Mult(1.0f, A, B, 0.0f, out); |
| } |
| |
| template <typename SType> |
| void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta, |
| Tensor *C) { |
| CHECK_EQ(A.shape().size(), 2u); |
| if (B.nDim() == 1u) { |
| TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { |
| auto a = TypeCast<SType, DType>(alpha); |
| auto b = TypeCast<SType, DType>(beta); |
| C->device()->Exec([a, A, b, B, C](Context * ctx) { |
| GEMV<DType, Lang>(a, A, B, b, C, ctx); |
| }, {A.block(), B.block()}, {C->block()}); |
| }); |
| } else { |
| CHECK(!C->transpose()); |
| TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { |
| auto a = TypeCast<SType, DType>(alpha); |
| auto b = TypeCast<SType, DType>(beta); |
| C->device()->Exec([a, A, b, B, C](Context * ctx) { |
| GEMM<DType, Lang>(a, A, B, b, C, |
| ctx); |
| }, {A.block(), B.block()}, {C->block()}); |
| }); |
| } |
| } |
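| |
| // Illustrative usage (a sketch): Mult dispatches to GEMM when B is 2-d and to |
| // GEMV when B is 1-d: |
| //   Tensor A(Shape{2, 3}), B(Shape{3, 4}); |
| //   A.SetValue(1.0f); B.SetValue(1.0f); |
| //   Tensor C = Mult(A, B);   // C.shape() == {2, 4}, every entry equals 3 |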
| |
| // ************************ |
| // Misc. |
| // ************************ |
| Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t) { |
| Tensor loss({p.shape(0)}, p.device(), p.data_type()); |
| ComputeCrossEntropy(p, t, &loss); |
| return loss; |
| } |
| |
| Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t) { |
| auto g = p.Clone(); |
| SoftmaxCrossEntropyBwd(t, &g); |
| return g; |
| } |
| |
| void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) { |
| CHECK_LE(p.nDim(), 2u); |
| CHECK_LE(t.nDim(), 2u); |
| size_t batchsize = 1; |
| if (p.nDim() == 2u) batchsize = p.shape(0); |
| size_t dim = p.Size() / batchsize; |
| TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, { |
| p.device()->Exec([batchsize, dim, t, p, loss](Context * ctx) { |
| bool int_target = t.Size() == batchsize; |
| ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(), |
| t.block(), loss->block(), ctx); |
| }, {p.block(), t.block()}, {loss->block()}); |
| }); |
| } |
| |
| void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) { |
| CHECK_LE(p->nDim(), 2u); |
| CHECK_LE(t.nDim(), 2u); |
| size_t batchsize = 1; |
| if (p->nDim() == 2u) batchsize = p->shape(0); |
| size_t dim = p->Size() / batchsize; |
| TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, { |
| p->device()->Exec([batchsize, dim, t, p](Context * ctx) { |
| bool int_target = t.Size() == batchsize; |
| SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim, |
| p->block(), t.block(), p->block(), ctx); |
| }, {p->block(), t.block()}, {p->block()}); |
| }); |
| } |
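| |
| // Illustrative usage (a sketch; the tensor names below are hypothetical, and the |
| // target t may hold either one-hot rows or one integer label per sample, |
| // detected via t.Size() == batchsize as above): |
| //   // p: probabilities of shape (batch, classes), e.g. from SoftMax |
| //   // t: target tensor of shape (batch,) with kInt labels |
| //   Tensor loss = CrossEntropyFwd(p, t);          // per-sample loss, shape (batch,) |
| //   Tensor grad = SoftmaxCrossEntropyBwd(p, t);   // gradient w.r.t. the logits |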
| |
| |
| // If the tensor is not transposed, Reshape only changes the shape and generates a new stride. |
| // If the tensor is already transposed (strided), the data is copied into a new contiguous block and the stride is regenerated. |
| Tensor& Tensor::Reshape(const Shape &shape) { |
| // Check the original volume against the new one. |
| // Do not use Product(shape_) because broadcasting can introduce stride-0 axes. |
| CHECK_EQ(Product(shape), Size()); |
| if (transpose()) { |
| Tensor t(shape, device_, data_type_); |
| singa::Transform(*this, &t); |
| shape_ = shape; |
| std::swap(t.block_, block_); |
| } else { |
| shape_ = shape; |
| } |
| generate_stride(); |
| return *this; |
| } |
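| |
| // Illustrative usage (a sketch): reshaping a contiguous tensor only rewrites |
| // shape_ and stride_; reshaping a transposed (strided) tensor first copies the |
| // data into a contiguous block via Transform: |
| //   Tensor m(Shape{2, 3}); |
| //   m.SetValue(1.0f); |
| //   Tensor v = Reshape(m, Shape{6});             // metadata-only reshape |
| //   Tensor w = Reshape(Transpose(m), Shape{6});  // copies into a new block |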
| |
| Tensor Reshape(const Tensor &in, const Shape &s) { |
| Tensor out(in); |
| return out.Reshape(s); |
| } |
| |
| } // namespace singa |