/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "singa/core/tensor.h"
#include <algorithm>
#include <utility>
#include "./tensor_math.h"
#include "./tensor_math_cpp.h"
#include "./tensor_math_cuda.h"
#include "./tensor_math_opencl.h"
#define Noaxis 9999
namespace singa {
template half_float::half TypeCast(const float &x);
template float TypeCast(const half_float::half &x);
template int TypeCast(const float &x);
template float TypeCast(const int &x);
Tensor::~Tensor() {
if (block_ != nullptr && block_->DecRefCount() == 0) {
device_->FreeBlock(block_);
}
block_ = nullptr;
}
Tensor::Tensor() {
device_ = defaultDevice;
stride_ = {1};
}
// non-strided constructors
Tensor::Tensor(const Shape &shape, DataType dtype)
: data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size) {
block_ = device_->NewBlock((int)size);
}
generate_stride();
}
// non-strided constructors with device
Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size) {
block_ = device_->NewBlock((int)size);
}
generate_stride();
}
Tensor::Tensor(const Tensor &in)
: data_type_(in.data_type_),
device_(in.device_),
block_(in.block()),
shape_(in.shape_),
stride_(in.stride_) {
if (block_ != nullptr) block_->IncRefCount();
}
Tensor::Tensor(Tensor &&in)
: data_type_(in.data_type_),
device_(in.device_),
shape_(std::move(in.shape_)),
stride_(std::move(in.stride_)) {
block_ = in.block_;
in.block_ = nullptr;
}
Tensor &Tensor::ResetLike(const Tensor &in) {
if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
device_ = in.device_;
data_type_ = in.data_type_;
block_ = device_->NewBlock((int)in.MemSize());
}
shape_ = in.shape_;
stride_ = in.stride_;
return *this;
}
Tensor &Tensor::Resize(const Shape &shape) {
if (Size() != Product(shape)) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
}
shape_ = shape;
generate_stride();
return *this;
}
Tensor Resize(const Tensor &in, const Shape &shape) {
Tensor out(in);
out.Resize(shape);
return out;
}
#define TYPE_TYPE_LANG_SWITCH(ldtype, LDType, rdtype, RDType, ltype, Lang, \
...) \
do { \
const int _SwitchShift = 3; \
int _SwitchHash = \
((ldtype) << _SwitchShift * 2) + ((rdtype) << _SwitchShift) + (ltype); \
switch (_SwitchHash) { \
case (((kFloat16) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCpp): { \
typedef half_float::half LDType; \
typedef float RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kFloat16 << _SwitchShift) + \
kCpp): { \
typedef float LDType; \
typedef half_float::half RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat16) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCuda): { \
typedef half_float::half LDType; \
typedef float RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kFloat16 << _SwitchShift) + \
kCuda): { \
typedef float LDType; \
typedef half_float::half RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) + \
kCuda): { \
typedef float LDType; \
typedef int RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCuda): { \
typedef int LDType; \
typedef float RDType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kFloat32) << _SwitchShift * 2) + (kInt << _SwitchShift) + \
kCpp): { \
typedef float LDType; \
typedef int RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case (((kInt) << _SwitchShift * 2) + (kFloat32 << _SwitchShift) + \
kCpp): { \
typedef int LDType; \
typedef float RDType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknown combination of left data type " \
<< DataType_Name(ldtype) << " and right data type " \
<< DataType_Name(rdtype) << " and language " \
<< LangType_Name(ltype); \
} \
} while (0)
// return new tensor
Tensor Tensor::AsType(const DataType type) const {
if (data_type_ != type) {
const Tensor &thisRef = *this;
Tensor ret(shape_, device_, type);
TYPE_TYPE_LANG_SWITCH(
data_type_, LDType, type, RDType, device_->lang(), Lang, {
ret.device()->Exec(
[thisRef, ret](Context *ctx) mutable {
CastCopy<LDType, RDType, Lang>(&thisRef, &ret, ctx);
},
{this->block()}, {ret.block()}, "AsType");
});
return ret;
} else {
Tensor t = this->Clone();
return t;
}
}
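// Usage sketch (non-normative), assuming the header's default dtype/device
// arguments (kFloat32 on defaultDevice):
//   Tensor x(Shape{2, 3});
//   x.SetValue(1.0f);
//   Tensor y = x.AsType(kFloat16);  // new half-precision tensor, x unchanged
//   x.ToType(kFloat16);             // in-place cast (see ToType below)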
Tensor &Tensor::ToType(const DataType type) {
CHECK(block() && block()->initialized() == true)
<< "the data of the tensor needs be initialized before casting to "
"another type";
if (data_type_ != type) {
auto ret = this->AsType(type);
std::swap(ret.block_, block_);
data_type_ = type;
}
return *this;
}
Tensor &Tensor::ToDevice(std::shared_ptr<Device> dst) {
  // TODO(wangwei) the comparison is too strict; compare device IDs instead?
if (device_ != dst) {
// WARNING: this function can't be buffered
Tensor tmp(shape_, dst, data_type_);
if (block_ != nullptr && Size() && block_->initialized())
tmp.CopyData(*this);
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = tmp.block_;
tmp.block_ = nullptr;
device_ = dst;
}
return *this;
}
Tensor &Tensor::ToHost() {
if (device_ != defaultDevice) ToDevice(device_->host());
return *this;
}
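// Usage sketch (non-normative): moving a tensor between devices. The device
// construction is omitted; any Device subclass is assumed to work.
//   std::shared_ptr<Device> dev = ...;  // e.g. a CUDA device
//   Tensor x(Shape{4}, defaultDevice, kFloat32);
//   x.SetValue(2.0f);
//   x.ToDevice(dev);   // copies the data and frees the old block
//   x.ToHost();        // moves it back to the host device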
template <typename DType>
void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
const size_t offset) const {
CHECK_EQ(sizeof(DType), SizeOf(data_type_))
<< "data_type is " << DataType_Name(data_type_)
<< " user given type is of size " << sizeof(DType);
if (src != nullptr) {
Device *dev = device_.get();
const Tensor &thisRef = *this;
size_t nBytes = sizeof(DType) * num;
size_t dst_offset = sizeof(DType) * offset;
device_->Exec(
[dev, thisRef, src, nBytes, dst_offset](Context *ctx) mutable {
dev->CopyDataFromHostPtr(thisRef.block(), src, nBytes, dst_offset,
ctx);
},
{}, {block()}, "CopyDataFromHostPtr");
} else {
LOG(WARNING) << "Copy data from null host ptr";
}
}
template void Tensor::CopyDataFromHostPtr(const unsigned char *src,
const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const half_float::half *src,
const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num,
const size_t offset) const;
template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num,
const size_t offset) const;
void Tensor::CopyData(const Tensor &src) {
CHECK_EQ(Size(), src.Size());
CHECK(block_ != nullptr);
CHECK_EQ(src.data_type(), data_type_)
<< "Could not copy data between different data type";
// Do copy only if the src's block is already initialized.
if (src.block_ != nullptr) {
singa::CopyDataToFrom(this, src, Size(), 0, 0);
}
}
void Tensor::RepeatData(const vector<size_t> &repeats, int axis,
int total_repeats, const Tensor &src) {
if (repeats.size() == 1) {
CHECK_EQ(Size(), src.Size() * total_repeats);
} else {
CHECK_EQ(Size(), src.Size() * total_repeats / src.shape()[axis]);
}
CHECK(block_ != nullptr);
// Do repeat only if the src's block is already initialized.
if (src.block_ != nullptr) {
singa::RepeatDataToFrom(false, repeats, axis, this, src, Size());
}
}
void Tensor::FromProto(const singa::TensorProto &proto) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = nullptr;
for (uint32_t s : proto.shape()) shape_.push_back(s);
data_type_ = proto.data_type();
block_ = device_->NewBlock((int)(Product(shape()) * SizeOf(data_type_)));
// transpose_ = proto.transpose();
stride_.clear();
for (int32_t s : proto.stride()) stride_.push_back(s);
switch (data_type_) {
case kFloat32: {
std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data_ptr[i] = static_cast<float>(proto.float_data((int)i));
CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
break;
}
case kDouble: {
std::unique_ptr<double[]> data(new double[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = proto.double_data((int)i);
CopyDataFromHostPtr<double>(data.get(), Product(shape_));
break;
}
case kInt: {
std::unique_ptr<int[]> data(new int[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = proto.int_data((int)i);
CopyDataFromHostPtr<int>(data.get(), Product(shape_));
break;
}
    /// TODO(wangji): Support the C++ char type using the protobuf bytes type
    /// (which is equivalent to string and thus differs from the other cases).
    /// The kChar and kUChar cases below are yet to be implemented.
/*
case kChar: {
std::unique_ptr<char[]> data(new char[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = static_cast<char>(proto.bytes_data(i));
break;
}
case kUChar: {
std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
for (size_t i = 0; i < Product(shape_); ++i)
data[i] = static_cast<unsigned char>(proto.bytes_data(i));
break;
}
*/
default: {
LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_);
}
}
}
void Tensor::to_proto(singa::TensorProto *proto) const {
proto->clear_shape();
for (auto s : shape_) {
proto->add_shape(s);
}
proto->set_data_type(data_type_);
// proto->set_transpose(transpose_);
proto->clear_stride();
for (auto s : stride_) {
proto->add_stride(s);
}
switch (data_type_) {
case kFloat32: {
proto->clear_float_data();
const float *data_ptr = data<float>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_float_data(data_ptr[i]);
break;
}
case kDouble: {
proto->clear_double_data();
const double *data_ptr = data<double>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_double_data(data_ptr[i]);
break;
}
case kInt: {
proto->clear_int_data();
const int *data_ptr = data<int>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_int_data(data_ptr[i]);
break;
}
/*
case kChar: {
proto->clear_bytes_data();
const char *data = data<char>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_bytes_data(static_cast<unsigned char>(data[i]));
break;
}
case kUChar: {
proto->clear_bytes_data();
const unsigned char *data = data<unsigned char>();
for (size_t i = 0; i < Product(shape_); ++i)
proto->add_bytes_data(static_cast<unsigned char>(data[i]));
break;
}
*/
default: {
LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_);
}
}
}
void Tensor::ToProto(singa::TensorProto *proto) const { to_proto(proto); }
Tensor Tensor::Repeat(const vector<size_t> &repeats, int axis,
std::shared_ptr<Device> device) {
if (device == nullptr) device = device_;
vector<size_t> tshape;
int total_repeats = 0;
if (axis == Noaxis) {
total_repeats = repeats[0];
tshape.push_back(Product(shape_) * total_repeats);
} else {
if (repeats.size() == 1) {
total_repeats = repeats[0];
for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
if (i == axis) {
tshape.push_back(shape_[i] * total_repeats);
} else {
tshape.push_back(shape_[i]);
}
}
} else {
if (repeats.size() != shape_[axis]) {
LOG(FATAL) << "the repeats number doesn't match the axis";
}
for (size_t i = 0; i < shape_[axis]; i++) {
if (repeats[i] < 0) {
LOG(FATAL) << "the repeats number is less than zero";
}
total_repeats += repeats[i];
}
for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
if (i == axis) {
tshape.push_back(total_repeats);
} else {
tshape.push_back(shape_[i]);
}
}
}
}
Tensor t(tshape, device_);
// t.stride_.push_back(1);
t.RepeatData(repeats, axis, total_repeats, *this);
return t;
}
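// Worked example (sketch), assuming the header defaults the device argument:
//   Tensor t(Shape{2, 3}, defaultDevice, kFloat32);  // values omitted
//   Tensor a = t.Repeat({2}, Noaxis);   // flattened, each element twice -> {12}
//   Tensor b = t.Repeat({2}, 1);        // each column twice -> {2, 6}
//   Tensor c = t.Repeat({1, 2, 3}, 1);  // per-column repeat counts -> {2, 6}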
Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
if (device == nullptr) device = device_;
Tensor t(shape_, device, data_type_);
// t.transpose_ = transpose_;
t.stride_ = stride_;
t.CopyData(*this);
return t;
}
void Tensor::Clone(Tensor *&other, std::shared_ptr<Device> device) const {
if (device == nullptr) device = device_;
other = new Tensor(shape_, device, data_type_);
other->stride_ = stride_;
other->CopyData(*this);
return;
}
Tensor &Tensor::Broadcast(const Shape &shape, const int ignore_last_dim) {
// TODO(wangwei) do we need to transform the mem layout if the tensor was
// transposed?
auto m = shape_.size() - 1, n = shape.size() - 1;
// ignore_last_dim is useful for mult broadcast
// e.g. (2,3,4)x(4,5) to (2,3,4)x(2,4,5)
if (ignore_last_dim < std::min(m, n) + 1) {
for (size_t i = ignore_last_dim; i <= std::min(m, n); i++) {
if ((shape.at(n - i) != shape_.at(m - i)) && (shape.at(n - i) != 1)) {
CHECK_EQ(shape_.at(m - i), 1) << "i= " << i << "\n"; // << Backtrace();
shape_.at(m - i) = shape.at(n - i);
stride_.at(m - i) = 0;
}
}
}
if (m < n) {
for (size_t i = m + 1; i <= n; i++) {
shape_.emplace(shape_.begin(), shape.at(n - i));
stride_.emplace(stride_.begin(), 0);
}
}
return *this;
}
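// Worked examples (sketch): trailing dimensions are aligned and broadcast
// dimensions get stride 0, assuming the header's default ignore_last_dim = 0.
//   shape_ = {2, 1, 4}, shape = {3, 4}    ->  shape_ becomes {2, 3, 4}
//   shape_ = {4, 5},    shape = {2, 4, 5} ->  shape_ becomes {2, 4, 5}
// With ignore_last_dim = 2 (as Mult uses below), the last two dims are left
// for the matmul and only the leading batch dims are broadcast.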
Tensor Broadcast(const Tensor &in, const Shape &shape,
const int ignore_last_dim) {
Tensor out(in);
return out.Broadcast(shape, ignore_last_dim);
}
Tensor &Tensor::T() {
// this function only works for 2d tensors
CHECK_EQ(shape_.size(), 2u);
Transpose();
return *this;
}
// normal transpose without axes
Tensor &Tensor::Transpose() {
std::reverse(shape_.begin(), shape_.end());
std::reverse(stride_.begin(), stride_.end());
return *this;
}
// transpose with axes
Tensor &Tensor::Transpose(const vector<size_t> &axes) {
CHECK_EQ(axes.size(), shape_.size())
<< "Tranpose axes's length should be equal to shape";
auto shape = shape_;
auto stride = stride_;
shape_.clear();
stride_.clear();
for (size_t n = 0; n < axes.size(); ++n) {
shape_.push_back(shape[axes[n]]);
stride_.push_back(stride[axes[n]]);
}
return *this;
}
// normal transpose without axes
Tensor Transpose(const Tensor &in) {
Tensor out(in);
out.Transpose();
return out;
}
// transpose with axes
Tensor Transpose(const Tensor &in, const vector<size_t> &axes) {
Tensor out(in);
out.Transpose(axes);
return out;
}
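// Usage sketch (non-normative): transposes only permute shape/stride metadata;
// the block is shared until Transform/Contiguous materializes a copy.
//   Tensor m(Shape{2, 3}, defaultDevice, kFloat32);
//   Tensor mt = Transpose(m);            // Shape{3, 2}, strides reversed
//   Tensor p(Shape{2, 3, 4}, defaultDevice, kFloat32);
//   Tensor q = Transpose(p, {2, 0, 1});  // Shape{4, 2, 3}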
Tensor &Tensor::operator=(const Tensor &in) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
stride_ = in.stride_;
data_type_ = in.data_type_;
shape_ = in.shape_;
device_ = in.device_;
block_ = in.block();
if (block_ != nullptr) block_->IncRefCount();
return *this;
}
Tensor &Tensor::operator=(Tensor &&in) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
stride_ = std::move(in.stride_);
data_type_ = in.data_type_;
shape_ = std::move(in.shape_);
device_ = in.device_;
block_ = in.block_;
in.block_ = nullptr;
return *this;
}
#define GenUnaryTensorArgMemberFn(op, fn) \
Tensor &Tensor::op(const Tensor &in) { \
Tensor out(*this); \
fn(*this, in, &out); \
return *this; \
}
GenUnaryTensorArgMemberFn(operator+=, Add);
GenUnaryTensorArgMemberFn(operator-=, Sub);
GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
GenUnaryTensorArgMemberFn(operator/=, Div);
#define GenUnaryScalarArgMemberFn(op, fn) \
template <typename DType> \
Tensor &Tensor::op(const DType x) { \
Tensor out(*this); \
fn(*this, x, &out); \
return *this; \
} \
template Tensor &Tensor::op<float>(const float x)
GenUnaryScalarArgMemberFn(operator-=, Sub);
GenUnaryScalarArgMemberFn(operator+=, Add);
GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
GenUnaryScalarArgMemberFn(operator/=, Div);
// ====================Tensor Operations=======================================
void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
const size_t dst_offset, const size_t src_offset) {
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
size_t nBytes = num * width;
auto d_offset = dst_offset * width;
auto s_offset = src_offset * width;
CHECK_GE(src.MemSize(), s_offset + nBytes);
CHECK_GE(dst->MemSize(), d_offset + nBytes);
Device *dev = nullptr;
CopyDirection direct;
std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
if (dst_dev->lang() != src_dev->lang()) {
    // let the non-Cpp device conduct the copy op
if (dst_dev->lang() == kCpp) {
dev = src_dev.get();
direct = kDeviceToHost;
} else if (src_dev->lang() == kCpp) {
dev = dst_dev.get();
direct = kHostToDevice;
} else {
LOG(FATAL) << "Not support mem copy between Cuda and OpenCL device";
}
} else {
dev = src_dev.get();
direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
}
Tensor &dstRef = *dst;
dev->Exec(
[dev, dstRef, src, nBytes, direct, d_offset,
s_offset](Context *ctx) mutable {
Block *from = src.block(), *to = dstRef.block();
dev->CopyDataToFrom(to, from, nBytes, direct, (int)d_offset,
(int)s_offset, ctx);
},
{src.block()}, {dst->block()}, "CopyDataToFrom");
}
void RepeatDataToFrom(bool broadcast_flag, const vector<size_t> &repeats,
int axis, Tensor *dst, const Tensor &src,
const size_t num) {
if (repeats.size() == 1) {
broadcast_flag = true;
} else if (repeats.size() > 1) {
if (axis == Noaxis) {
LOG(FATAL) << "When repeats parameter is sequence, axis cannot be None";
}
}
for (size_t i = 0; i < repeats.size(); i++) {
CHECK_GE(repeats[i], 0);
}
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
// size_t nBytes = num * width;
int chunk = width;
int axis_shape = 1;
int shape_outer = 1;
if (axis == Noaxis) {
axis_shape = 1;
shape_outer = Product(src.shape());
} else {
for (int i = 0; i < axis; i++) {
shape_outer *= src.shape()[i];
}
axis_shape = src.shape()[axis];
for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) {
chunk *= src.shape()[i];
}
}
Device *dev = nullptr;
CopyDirection direct;
std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
if (dst_dev->lang() != src_dev->lang()) {
    // let the non-Cpp device conduct the copy op
if (dst_dev->lang() == kCpp) {
dev = src_dev.get();
direct = kDeviceToHost;
} else if (src_dev->lang() == kCpp) {
dev = dst_dev.get();
direct = kHostToDevice;
} else {
      LOG(FATAL) << "Memory repeat-copy between CUDA and OpenCL devices is "
                    "not supported";
}
} else {
dev = src_dev.get();
direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
}
int dst_offset = 0;
int src_offset = 0;
Tensor &dstRef = *dst;
for (int i = 0; i < shape_outer; i++) {
for (int j = 0; j < axis_shape; j++) {
int temp = broadcast_flag ? repeats[0] : repeats[j];
for (int k = 0; k < temp; k++) {
dev->Exec(
[dev, dstRef, src, chunk, direct, dst_offset,
src_offset](Context *ctx) mutable {
Block *from = src.block(), *to = dstRef.block();
dev->CopyDataToFrom(to, from, chunk, direct, dst_offset,
src_offset, ctx);
},
{src.block()}, {dst->block()}, "CopyDataToFrom");
dst_offset += chunk;
}
src_offset += chunk;
}
}
}
//============================================================================
/// typedef DType according to the type value.
/// DType would be used in the code block __VA_ARGS__.
#define TYPE_SWITCH(type, DType, ...) \
do { \
switch (type) { \
case kFloat16: { \
typedef half_float::half DType; \
{ __VA_ARGS__ } \
break; \
} \
case kFloat32: { \
typedef float DType; \
{ __VA_ARGS__ } \
break; \
} \
case kInt: { \
typedef int DType; \
{ __VA_ARGS__ } \
break; \
} \
case kChar: { \
typedef char DType; \
{ __VA_ARGS__ } \
break; \
} \
case kDouble: { \
typedef double DType; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
} \
} while (0)
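// Usage sketch: TYPE_SWITCH instantiates the block once with the concrete
// DType chosen at runtime, e.g. as DivColumn/DivRow do below:
//   TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });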
/// typedef DType and Lang according to data type and device programming
/// language respectively.
/// type is from DataType, and lang is from LangType.
/// DType and Lang would be used in __VA_ARGS__.
#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \
do { \
const int _SwitchShift = 3; \
int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \
switch (_SwitchHash) { \
case ((kFloat16 << _SwitchShift) + kCpp): { \
typedef half_float::half DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat16 << _SwitchShift) + kCuda): { \
typedef half_float::half DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kCuda): { \
typedef float DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kInt << _SwitchShift) + kCuda): { \
typedef int DType; \
typedef lang::Cuda Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kCpp): { \
typedef float DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kInt << _SwitchShift) + kCpp): { \
typedef int DType; \
typedef lang::Cpp Lang; \
{ __VA_ARGS__ } \
break; \
} \
case ((kFloat32 << _SwitchShift) + kOpencl): { \
typedef float DType; \
typedef lang::Opencl Lang; \
{ __VA_ARGS__ } \
break; \
} \
default: \
LOG(FATAL) << "Unknown combination of data type " \
<< DataType_Name(dtype) << " and language " \
<< LangType_Name(ltype); \
} \
} while (0)
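// Usage sketch: TYPE_LANG_SWITCH dispatches on (data type, device language)
// and typedefs DType and Lang for the block; the element-wise wrappers below
// use it to buffer a kernel call on the device, e.g. (for fn = Abs):
//   TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
//     Tensor &retRef = *ret;
//     ret->device()->Exec(
//         [t, retRef](Context *ctx) mutable {
//           Abs<DType, Lang>(t, &retRef, ctx);
//         },
//         {t.block()}, {ret->block()}, "Abs");
//   });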
// =============Element-wise operations====================================
float Tensor::l1() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec(
[&nrm, this](Context *ctx) {
DType ret = DType(0);
Asum<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
},
{this->block()}, {}, "l1");
});
return nrm / Size();
}
// DEPRECATED use l1()
float Tensor::L1() const { return l1(); }
/// L2 norm. Do not use Nrm2 (name conflict).
float Tensor::l2() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec(
[&nrm, this](Context *ctx) {
Nrm2<DType, Lang>(*this, &nrm, ctx);
},
{this->block()}, {}, "L1");
});
return nrm / Size();
}
// DEPRECATED use l2()
float Tensor::L2() const { return l2(); }
template <typename SType>
void Tensor::SetValue(const SType x) {
// auto size = Size();
auto ptr = block_;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
DType tmp = TypeCast<SType, DType>(x);
Tensor &thisRef = *this;
device_->Exec(
[thisRef, tmp](Context *ctx) mutable {
Set<DType, Lang>(tmp, &thisRef, ctx);
},
{}, {ptr}, "SetValue");
});
}
template void Tensor::SetValue<float>(const float x);
template void Tensor::SetValue<half_float::half>(const half_float::half x);
template void Tensor::SetValue<int>(const int x);
template <typename SType>
void Tensor::get_value(SType *value, const size_t num) const {
CHECK(device_ == defaultDevice);
Tensor t(shape_, device_, data_type_);
// transform function arrange data in memory considering stride
singa::Transform(*this, &t);
auto ptr = static_cast<const SType *>(t.block()->data());
for (size_t i = 0; i < num; i++) value[i] = ptr[i];
}
template void Tensor::get_value<float>(float *value, const size_t num) const;
template void Tensor::get_value<half_float::half>(half_float::half *value,
const size_t num) const;
template void Tensor::get_value<int>(int *value, const size_t num) const;
// DEPRECATED
template <typename SType>
void Tensor::GetValue(SType *value, const size_t num) const {
get_value(value, num);
}
template void Tensor::GetValue<float>(float *value, const size_t num) const;
template void Tensor::GetValue<int>(int *value, const size_t num) const;
#define EltwiseUnaryTensorFn(fn, t, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[t, retRef](Context *ctx) mutable { \
fn<DType, Lang>(t, &retRef, ctx); \
}, \
{t.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenUnaryTensorFn(fn) \
Tensor fn(const Tensor &in) { \
Tensor ret(in.shape(), in.device(), in.data_type()); \
Tensor *retptr = &ret; \
EltwiseUnaryTensorFn(fn, in, retptr); \
return ret; \
} \
void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
GenUnaryTensorFn(Abs);
GenUnaryTensorFn(Erf);
GenUnaryTensorFn(Ceil);
GenUnaryTensorFn(Floor);
GenUnaryTensorFn(Round);
GenUnaryTensorFn(RoundE);
GenUnaryTensorFn(Exp);
GenUnaryTensorFn(Log);
GenUnaryTensorFn(ReLU);
GenUnaryTensorFn(Sigmoid);
GenUnaryTensorFn(SoftPlus);
GenUnaryTensorFn(SoftSign);
GenUnaryTensorFn(Sign);
GenUnaryTensorFn(Sqrt);
GenUnaryTensorFn(Square);
GenUnaryTensorFn(Transform);
GenUnaryTensorFn(Cos);
GenUnaryTensorFn(Cosh);
GenUnaryTensorFn(Acos);
GenUnaryTensorFn(Acosh);
GenUnaryTensorFn(Sin);
GenUnaryTensorFn(Sinh);
GenUnaryTensorFn(Asin);
GenUnaryTensorFn(Asinh);
GenUnaryTensorFn(Tan);
GenUnaryTensorFn(Tanh);
GenUnaryTensorFn(Atan);
GenUnaryTensorFn(Atanh);
GenUnaryTensorFn(SoftMax);
// add axis to softmax API according to ONNX specification
// https://github.com/onnx/onnx/blob/master/docs/Operators.md#Softmax
void SoftMax(const Tensor &in, Tensor *out, int axis) {
// {a_0, a_1, ..., a_k-1, a_k, ... a_n-1}
// reshape to
// { a_0 * a_1 * ... a_k-1, a_k * ... a_n-1 }
// assert axis \in {-r, r-1}
CHECK_LE(axis, (int)in.shape().size() - 1);
CHECK_GE(axis, -1 * (int)in.nDim());
Shape original_shape = in.shape();
if (axis < 0) axis = in.shape().size() + axis;
Shape coerced_shape = {1, 1};
for (std::size_t i = 0, max = in.shape().size(); i != max; ++i) {
if (i < axis)
coerced_shape[0] *= in.shape()[i];
else
coerced_shape[1] *= in.shape()[i];
}
Tensor in_reshaped = Reshape(in, coerced_shape);
out->Reshape(coerced_shape);
  // improve numerical stability by subtracting the row max: x - x.max()
auto in_max = RowMax(in_reshaped);
in_max.Reshape({coerced_shape[0], 1});
in_reshaped = in_reshaped - in_max;
SoftMax(in_reshaped, out);
out->Reshape(original_shape);
}
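// Worked example (sketch): for in.shape() = {2, 3, 4} and axis = 1, the input
// is coerced to {2, 12}; each of the 2 rows is normalized (after subtracting
// its row max for stability) and the output is reshaped back to {2, 3, 4}.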
Tensor SoftMax(const Tensor &in, int axis) {
Tensor ret(in.shape(), in.device(), in.data_type());
auto *retptr = &ret;
SoftMax(in, retptr, axis);
return ret;
}
void SoftMaxBackward(const Tensor &in, Tensor *out, int axis,
const Tensor &fdout) {
// {a_0, a_1, ..., a_k-1, a_k, ... a_n-1}
// reshape to
// { a_0 * a_1 * ... a_k-1, a_k * ... a_n-1 }
// assert axis \in {-r, r-1}
CHECK_LE(axis, (int)in.shape().size() - 1);
CHECK_GE(axis, -1 * (int)in.nDim());
Shape original_shape = in.shape();
if (axis < 0) axis = in.shape().size() + axis;
Shape coerced_shape = {1, 1};
for (std::size_t i = 0, max = in.shape().size(); i != max; ++i) {
if (i < axis)
coerced_shape[0] *= in.shape()[i];
else
coerced_shape[1] *= in.shape()[i];
}
Tensor in_reshaped = Reshape(in, coerced_shape);
out->Reshape(coerced_shape);
do {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
Tensor &outRef = *out;
out->device()->Exec(
[in, outRef, fdout](Context *ctx) mutable {
SoftMaxBackward<DType, Lang>(in, &outRef, fdout, ctx);
},
{in.block(), fdout.block()}, {out->block()}, "SoftmaxBackward");
});
} while (0);
out->Reshape(original_shape);
}
Tensor SoftMaxBackward(const Tensor &in, int axis, const Tensor &fdout) {
Tensor ret(in.shape(), in.device(), in.data_type());
auto *retptr = &ret;
SoftMaxBackward(in, retptr, axis, fdout);
return ret;
}
#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \
do { \
TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())) \
<< "lhs dtype size" << sizeof(DType) << " rhs dtype size" \
<< SizeOf(rhs.data_type()); \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[lhs, rhs, retRef](Context *ctx) mutable { \
fn<DType, Lang>(lhs, rhs, &retRef, ctx); \
}, \
{lhs.block(), rhs.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenBinaryTensorFn(op, fn) \
Tensor op(const Tensor &lhs, const Tensor &rhs) { \
if (lhs.shape() != rhs.shape()) { \
auto lhs_ = Broadcast(lhs, rhs.shape()); \
auto rhs_ = Broadcast(rhs, lhs.shape()); \
Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type()); \
fn(lhs_, rhs_, &ret); \
return ret; \
} else { \
Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \
fn(lhs, rhs, &ret); \
return ret; \
} \
} \
void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
CHECK_EQ(lhs.device(), ret->device()); \
CHECK_EQ(rhs.device(), ret->device()); \
if (lhs.shape() != rhs.shape()) { \
auto lhs_ = Broadcast(lhs, rhs.shape()); \
auto rhs_ = Broadcast(rhs, lhs.shape()); \
CHECK(lhs_.shape() == ret->shape()); \
EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret); \
} else { \
CHECK(lhs.shape() == ret->shape()); \
EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \
} \
  }
// broadcasting operations:
// https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md
GenBinaryTensorFn(operator+, Add);
GenBinaryTensorFn(operator-, Sub);
GenBinaryTensorFn(operator*, EltwiseMult);
GenBinaryTensorFn(operator/, Div);
GenBinaryTensorFn(Pow, Pow);
GenBinaryTensorFn(operator<, LT);
GenBinaryTensorFn(operator<=, LE);
GenBinaryTensorFn(operator>, GT);
GenBinaryTensorFn(operator>=, GE);
GenBinaryTensorFn(operator==, EQ);
GenBinaryTensorFn(ReLUBackward, ReLUBackward);
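// Usage sketch (non-normative): binary operators broadcast operands with
// different shapes following the ONNX broadcasting rules referenced above.
//   Tensor a(Shape{2, 3}, defaultDevice, kFloat32);
//   Tensor b(Shape{3}, defaultDevice, kFloat32);
//   a.SetValue(1.0f);
//   b.SetValue(2.0f);
//   Tensor c = a + b;     // Shape{2, 3}, every element 3.0f
//   Tensor d = a * 0.5f;  // tensor-scalar overloads are defined below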
#define EltwiseTensorScalarFn(fn, t, x, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
DType tmp_x = TypeCast<SType, DType>(x); \
Tensor &retRef = *ret; \
ret->device()->Exec( \
[t, tmp_x, retRef](Context *ctx) mutable { \
fn<DType, Lang>(t, tmp_x, &retRef, ctx); \
}, \
{t.block()}, {ret->block()}, #fn); \
}); \
} while (0)
#define GenTensorScalarFn(op, fn) \
template <typename SType> \
Tensor op(const Tensor &in, const SType x) { \
Tensor ret(in.shape(), in.device(), in.data_type()); \
fn(in, x, &ret); \
return ret; \
} \
template <typename SType> \
void fn(const Tensor &in, const SType x, Tensor *ret) { \
EltwiseTensorScalarFn(fn, in, x, ret); \
} \
template Tensor op<float>(const Tensor &in, const float x); \
template void fn<float>(const Tensor &in, const float x, Tensor *ret)
GenTensorScalarFn(operator+, Add);
GenTensorScalarFn(operator-, Sub);
GenTensorScalarFn(operator*, EltwiseMult);
GenTensorScalarFn(operator/, Div);
GenTensorScalarFn(Pow, Pow);
GenTensorScalarFn(operator<, LT);
GenTensorScalarFn(operator<=, LE);
GenTensorScalarFn(operator>, GT);
GenTensorScalarFn(operator>=, GE);
GenTensorScalarFn(operator==, EQ);
template <typename SType>
Tensor Div(const SType alpha, const Tensor &in) {
Tensor out(in.shape(), in.device(), in.data_type());
Div(alpha, in, &out);
return out;
}
template Tensor Div<float>(const float, const Tensor &);
template <typename SType>
void Div(const SType alpha, const Tensor &in, Tensor *out) {
CheckDataTypeAndLang(in, *out);
CHECK(in.shape() == out->shape());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
DType tmp_alpha = TypeCast<SType, DType>(alpha);
Tensor &outRef = *out;
in.device()->Exec(
[tmp_alpha, in, outRef](Context *ctx) mutable {
Div<DType, Lang>(tmp_alpha, in, &outRef, ctx);
},
{in.block()}, {out->block()}, "Div");
});
}
template void Div<float>(const float, const Tensor &, Tensor *);
// =============Matrix operations============================================
Tensor Average(const Tensor &M, int axis) {
  // operator/ only has an implementation for the float scalar type, hence it
  // is necessary to cast the denominator to a float.
  // TODO(wangwei) implement a function to cast the scalar types involved in
  // Tensor functions. E.g.,
  // template<S, D>
  // D CastTo(S x) {
  //   return D(x);
  // }
  // for special types, e.g., fp16:
  // template<>
  // fp16 CastTo(float x) {
  //   ....
  // }
if (axis == 0) {
return Sum(M, 0) / (1.0f * M.shape(0));
} else if (axis == 1) {
return Sum(M, 1) / (1.0f * M.shape(1));
} else {
LOG(FATAL) << "Not currently support Sum over axis = " << axis;
}
}
// TODO(wangwei) consider async exec
template <>
float Sum<float>(const Tensor &in) {
float s = 0.0f;
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec(
// cannot use this sum function in computational graph
[in, one, &s](Context *ctx) mutable {
DType ret = DType(0);
Dot<DType, Lang>(in, one, &ret, ctx);
s = ret;
},
{in.block(), one.block()}, {}, "Sum");
});
return s;
}
Tensor Sum(const Tensor &M, int axis) {
if (axis == 0) {
Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
SumRows(M, &out);
return out;
} else {
CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
Tensor out = Tensor(Shape{M.shape(0)}, M.device(), M.data_type());
SumColumns(M, &out);
return out;
}
}
Tensor SumAll(const Tensor &in) {
Tensor out({(size_t)1}, in.device(), in.data_type());
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec(
[in, one, out](Context *ctx) mutable {
Dot<DType, Lang>(in, one, &out, ctx);
},
{in.block(), one.block()}, {out.block()}, "SumAll");
});
return out;
}
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
in.device()->Exec(
[in, ret](Context *ctx) mutable {
// size_t nrow = 1;
// if (in.nDim() > 1) nrow = in.shape(0);
// size_t ncol = in.Size() / nrow;
RowMax<DType, Lang>(in, &ret, ctx);
},
{in.block()}, {ret.block()}, "RowMax");
});
return ret;
}
void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
/// Add vector 'v' onto each column of matrix M.
template <typename SType>
void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor *M) {
if (M->transpose()) {
Tensor X(Transpose(*M));
AddRow(v, &X);
} else {
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M->shape(0), nb_col = M->shape(1);
CHECK_EQ(nb_row, v.Size());
Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor vmat(Reshape(v, Shape{nb_row, 1}));
Mult(alpha, vmat, one, beta, M);
}
}
template void AddColumn(const float alpha, const float beta, const Tensor &v,
Tensor *M);
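// Note (sketch): AddColumn is formulated as a rank-1 GEMM update,
// M = alpha * v * 1^T + beta * M, with 'one' a 1 x ncol row of ones and vmat
// the vector v reshaped to nrow x 1; AddRow below mirrors this with 1 * v^T.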
void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
/// Add vector 'v' onto each row of matrix M.
template <typename SType>
void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
if (M->transpose()) {
Tensor X(Transpose(*M));
AddColumn(v, &X);
} else {
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M->shape(0), nb_col = M->shape(1);
CHECK_EQ(nb_col, v.Size());
Tensor one(Shape{nb_row, 1}, M->device(), M->data_type());
one.SetValue(1.0f);
Tensor vmat(Reshape(v, Shape{1, nb_col}));
Mult(alpha, one, vmat, beta, M);
}
}
template void AddRow(const float alpha, const float beta, const Tensor &v,
Tensor *M);
/// Divide each column of matrix M by vector 'v' element-wise, in place.
void DivColumn(const Tensor &v, Tensor *M) {
Tensor inv;
TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
MultColumn(inv, M);
}
Tensor ConcatOn(const std::vector<Tensor> &in, int axis) {
vector<Tensor> tmp;
Shape out_shape = in[0].shape();
size_t dim = in[0].shape().size();
// CHECK_GE(dim, 2u) << " Only work for tensor of dim >=2 ";
size_t size = in[0].Size() / in[0].shape(axis);
size_t new_size = 0u;
for (const auto &t : in) {
CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim";
CHECK_EQ(size, t.Size() / t.shape(axis))
<< "The size of all axis should "
<< " be the same except the concatenated axis";
new_size += t.shape(axis);
}
out_shape[axis] = new_size;
if (axis == 0) {
size_t nrow = 0;
for (const auto &t : in) {
nrow += t.shape(0);
tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
}
auto ret = ConcatenateRows(tmp);
ret.Reshape(out_shape);
return ret;
} else {
for (const auto &t : in) {
size_t nrow = 1;
for (int i = 0; i < axis; i++) nrow *= t.shape(i);
tmp.push_back(Reshape(t, {nrow, t.Size() / nrow}));
}
auto ret = ConcatenateColumns(tmp);
ret.Reshape(out_shape);
return ret;
}
}
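// Usage sketch (non-normative): concatenation along an arbitrary axis.
//   Tensor a(Shape{2, 3, 4}, defaultDevice, kFloat32);
//   Tensor b(Shape{2, 5, 4}, defaultDevice, kFloat32);
//   a.SetValue(0.0f);
//   b.SetValue(1.0f);
//   Tensor c = ConcatOn({a, b}, 1);  // Shape{2, 8, 4}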
Tensor ConcatenateRows(const vector<Tensor> &in) {
size_t nrow = 0, ncol = 0;
CHECK(in.size());
for (const auto &x : in) {
CHECK(!x.transpose());
CHECK_EQ(x.nDim(), 2u);
nrow += x.shape(0);
if (ncol == 0)
ncol = x.shape(1);
else
CHECK_EQ(ncol, x.shape(1));
}
Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type());
size_t dst_offset = 0;
for (const auto &x : in) {
CopyDataToFrom(&out, x, x.Size(), dst_offset, 0);
dst_offset += x.Size();
}
return out;
}
Tensor ConcatRows(const vector<Tensor> &in) { return ConcatenateRows(in); }
// TODO(wangwei) add a copy-patch function to improve the efficiency on GPU.
Tensor ConcatenateColumns(const vector<Tensor> &in) {
size_t nrow = 0, ncol = 0;
CHECK(in.size());
for (const auto &x : in) {
CHECK(!x.transpose());
CHECK_EQ(x.nDim(), 2u);
ncol += x.shape(1);
if (nrow == 0)
nrow = x.shape(0);
else
CHECK_EQ(nrow, x.shape(0));
}
Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type());
for (size_t row = 0; row < nrow; row++) {
size_t dst_offset = row * ncol;
for (const auto &x : in) {
size_t src_offset = row * x.shape(1);
CopyDataToFrom(&out, x, x.shape(1), dst_offset, src_offset);
dst_offset += x.shape(1);
}
CHECK_EQ(dst_offset, row * ncol + ncol);
}
return out;
}
Tensor ConcatColumns(const vector<Tensor> &in) {
return ConcatenateColumns(in);
}
Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
CHECK_LT(start, end);
CHECK_GE(in.shape(0), end) << "Tensor size must >= end";
Shape s = in.shape();
s[0] = end - start;
size_t sample_size = in.Size() / in.shape(0);
Tensor out(s, in.device(), in.data_type());
CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size);
return out;
}
Tensor SliceOn(const Tensor &in, const size_t start, const size_t end,
int axis) {
Shape out_shape = in.shape();
out_shape[axis] = end - start;
if (axis == 0) {
auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}),
start, end);
ret.Reshape(out_shape);
return ret;
} else {
size_t nrow = 1;
for (int i = 0; i < axis; i++) nrow *= in.shape(i);
auto suffix = in.Size() / nrow / in.shape(axis);
auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
start * suffix, end * suffix);
ret.Reshape(out_shape);
return ret;
}
}
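// Usage sketch (non-normative): slicing a contiguous range along one axis.
//   Tensor x(Shape{4, 5, 6}, defaultDevice, kFloat32);
//   x.SetValue(0.0f);
//   Tensor y = SliceOn(x, 1, 3, 0);  // rows [1, 3)   -> Shape{2, 5, 6}
//   Tensor z = SliceOn(x, 2, 4, 1);  // axis-1 [2, 4) -> Shape{4, 2, 6}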
Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) {
return CopyRows(in, start, end);
}
Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) {
CHECK_EQ(in.nDim(), 2u);
CHECK_LT(start, end);
CHECK_GE(in.shape(1), end);
Shape s{in.shape(0), end - start};
Tensor out(s, in.device(), in.data_type());
for (size_t row = 0; row < out.shape(0); row++) {
size_t src_offset = row * in.shape(1) + start;
size_t dst_offset = row * out.shape(1);
CopyDataToFrom(&out, in, end - start, dst_offset, src_offset);
}
return out;
}
Tensor SliceColumns(const Tensor &in, const size_t start, const size_t end) {
return CopyColumns(in, start, end);
}
/// Divide each row of matrix M by vector 'v' element-wise, in place.
void DivRow(const Tensor &v, Tensor *M) {
Tensor inv;
TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
MultRow(inv, M);
}
/// Multiply each column of matrix M by vector 'v' element-wise, in place.
void MultColumn(const Tensor &v, Tensor *M) {
// CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
Tensor &MRef = *M;
v.device()->Exec(
[MRef, v](Context *ctx) mutable {
DGMM<DType, Lang>(false, MRef, v, &MRef, ctx);
},
{M->block(), v.block()}, {M->block()}, "MultColumn");
});
}
/// Multiply each row of matrix M by vector 'v' element-wise, in place.
void MultRow(const Tensor &v, Tensor *M) {
// CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
Tensor &MRef = *M;
v.device()->Exec(
[MRef, v](Context *ctx) mutable {
DGMM<DType, Lang>(true, MRef, v, &MRef, ctx);
},
{M->block(), v.block()}, {M->block()}, "MultRow");
});
}
void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }
void SumColumns(const Tensor &M, Tensor *v) {
if (M.transpose()) {
Tensor X = Transpose(M);
SumRows(X, v);
} else {
CHECK_EQ(M.nDim(), 2u);
// CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
CHECK_EQ(nb_row, v->Size());
Tensor one(Shape{nb_col}, M.device(), M.data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Mult(M, one, v);
}
}
void SumRows(const Tensor &M, Tensor *v) {
if (M.transpose()) {
Tensor X = Transpose(M);
SumColumns(X, v);
} else {
CHECK_EQ(M.nDim(), 2u);
// CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
size_t nb_row = M.shape(0), nb_col = M.shape(1);
CHECK_EQ(nb_col, v->Size());
Tensor one(Shape{nb_row}, M.device(), M.data_type());
one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor X = Transpose(M);
Mult(X, one, v);
}
}
// ====================Random operations=====================================
template <typename SType>
void Bernoulli(const SType p, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto prob = TypeCast<SType, DType>(p);
Tensor &outRef = *out;
out->device()->Exec(
[prob, outRef](Context *ctx) mutable {
Bernoulli<DType, Lang>(prob, &outRef, ctx);
},
{}, {out->block()}, "Bernoulli", true);
});
}
template void Bernoulli<float>(const float p, Tensor *out);
template <typename SType>
void Uniform(const SType low, const SType high, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto l = TypeCast<SType, DType>(low);
auto h = TypeCast<SType, DType>(high);
Tensor &outRef = *out;
out->device()->Exec(
[l, h, outRef](Context *ctx) mutable {
Uniform<DType, Lang>(l, h, &outRef, ctx);
},
{}, {out->block()}, "Uniform", true);
});
}
template void Uniform<float>(const float low, const float high, Tensor *out);
template <typename SType>
void Gaussian(const SType mean, const SType std, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto m = TypeCast<SType, DType>(mean);
auto s = TypeCast<SType, DType>(std);
Tensor &outRef = *out;
out->device()->Exec(
[m, s, outRef](Context *ctx) mutable {
Gaussian<DType, Lang>(m, s, &outRef, ctx);
},
{}, {out->block()}, "Gaussian", true);
});
}
template void Gaussian<float>(const float mean, const float std, Tensor *out);
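// Usage sketch (non-normative): in-place random initialization; only the
// float overloads are instantiated in this file.
//   Tensor w(Shape{10, 10}, defaultDevice, kFloat32);
//   Gaussian(0.0f, 0.02f, &w);  // mean 0, std 0.02
//   Uniform(-0.1f, 0.1f, &w);   // uniform between -0.1 and 0.1
//   Bernoulli(0.5f, &w);        // 0/1 entries with p = 0.5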
// ================Blas operations============================================
template <typename SType>
void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
Tensor &outRef = *out;
Tensor fake(*out);
out->device()->Exec(
[a, in, outRef, fake](Context *ctx) mutable {
Axpy<DType, Lang>(a, in, &outRef, ctx);
},
{in.block(), out->block()}, {out->block()}, "Axpy");
});
}
template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out);
void Axpy(const Tensor &alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
Tensor fake(*out);
Tensor &outRef = *out;
out->device()->Exec(
[alpha, in, outRef, fake](Context *ctx) mutable {
Axpy<DType, Lang>(alpha, in, &outRef, ctx);
},
{alpha.block(), in.block(), out->block()}, {out->block()}, "Axpy");
});
}
Tensor Mult(const Tensor &A, const Tensor &B) {
auto A_ = Broadcast(A, B.shape(), 2);
auto B_ = Broadcast(B, A.shape(), 2);
Shape s = A_.shape();
s.pop_back();
s.push_back(B.shape(B.nDim() - 1));
Tensor out(s, A.device(), A.data_type());
Mult(A_, B_, &out);
return out;
}
void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
Mult(1.0f, A, B, 0.0f, out);
}
template <typename SType>
void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
Tensor *C) {
Tensor fakeC;
vector<Block *> read_blocks = {A.block(), B.block()};
if (beta) {
fakeC = *C;
read_blocks.push_back(C->block());
}
if (B.nDim() == 1u) {
CHECK_EQ(A.shape().size(), 2u);
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor &CRef = *C;
C->device()->Exec(
[a, A, b, B, CRef, fakeC](Context *ctx) mutable {
GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMV");
});
} else if (B.nDim() == 2u) {
CHECK_EQ(A.shape().size(), 2u);
CHECK(!C->transpose());
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor &CRef = *C;
C->device()->Exec(
[a, A, b, B, CRef, fakeC](Context *ctx) mutable {
GEMM<DType, Lang>(a, A, B, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMM");
});
} else if (B.nDim() == 3u || B.nDim() == 4u) {
CHECK_EQ(A.shape().size(), B.shape().size());
CHECK(!C->transpose());
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
Tensor A_tmp;
Tensor B_tmp;
if (A.transpose() || A.broadcasted()) {
A_tmp = Tensor(A.shape(), A.device(), A.data_type());
singa::Transform(A, &A_tmp);
} else {
A_tmp = A;
}
if (B.transpose() || B.broadcasted()) {
B_tmp = Tensor(B.shape(), B.device(), B.data_type());
singa::Transform(B, &B_tmp);
} else {
B_tmp = B;
}
// batch GEMM should have same batch size
CHECK_EQ(A_tmp.shape(0), B_tmp.shape(0));
if (B.nDim() == 4u) CHECK_EQ(A_tmp.shape(1), B_tmp.shape(1));
Tensor &CRef = *C;
C->device()->Exec(
[a, A_tmp, b, B_tmp, CRef, fakeC](Context *ctx) mutable {
GEMMBatched<DType, Lang>(a, A_tmp, B_tmp, b, &CRef, ctx);
},
read_blocks, {C->block()}, "GEMMBatched");
});
} else {
LOG(FATAL) << "Un-supported tensor dimentions " << A.nDim() << "d matmul "
<< B.nDim() << "d\n";
}
}
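// Usage sketch (non-normative): Mult dispatches to GEMV for a 1-D rhs, GEMM
// for 2-D operands, and batched GEMM for 3-D/4-D operands whose batch dims
// match (leading dims are broadcast via Broadcast(A, B.shape(), 2)).
//   Tensor A(Shape{2, 3}, defaultDevice, kFloat32);
//   Tensor B(Shape{3, 4}, defaultDevice, kFloat32);
//   A.SetValue(1.0f);
//   B.SetValue(1.0f);
//   Tensor C = Mult(A, B);  // Shape{2, 4}, every entry 3.0f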
// ************************
// Misc.
// ************************
Tensor CrossEntropyFwd(const Tensor &p, const Tensor &t) {
Tensor loss({p.shape(0)}, p.device(), p.data_type());
ComputeCrossEntropy(p, t, &loss);
return loss;
}
Tensor SoftmaxCrossEntropyBwd(const Tensor &p, const Tensor &t) {
Tensor g = p.Clone();
SoftmaxCrossEntropyBwd(t, &g);
return g;
}
void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
CHECK_LE(p.nDim(), 2u);
CHECK_LE(t.nDim(), 2u);
size_t batchsize = 1;
if (p.nDim() == 2u) batchsize = p.shape(0);
size_t dim = p.Size() / batchsize;
TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
Tensor &lossRef = *loss;
p.device()->Exec(
[batchsize, dim, t, p, lossRef](Context *ctx) mutable {
bool int_target = t.Size() == batchsize;
ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p, t,
&lossRef, ctx);
},
{p.block(), t.block()}, {loss->block()}, "ComputeCrossEntropy");
});
}
void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
CHECK_LE(p->nDim(), 2u);
CHECK_LE(t.nDim(), 2u);
size_t batchsize = 1;
if (p->nDim() == 2u) batchsize = p->shape(0);
size_t dim = p->Size() / batchsize;
TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
Tensor &pRef = *p;
Tensor pFake(*p); // just add a ref count
p->device()->Exec(
[batchsize, dim, t, pRef, pFake, p](Context *ctx) mutable {
bool int_target = t.Size() == batchsize;
SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim, pRef,
t, &pRef, ctx);
},
{p->block(), t.block()}, {p->block()}, "SoftmaxCrossEntropyBackward");
});
}
Tensor &Tensor::Contiguous() {
if (transpose()) {
Tensor t(shape_, device_, data_type_);
singa::Transform(*this, &t);
std::swap(t.block_, block_);
}
return *this;
}
Tensor Contiguous(const Tensor &in) {
Tensor out(in);
return out.Contiguous();
}
// If the tensor is not transposed, only the shape is changed and a new stride
// is generated; if it is already transposed, the memory is made contiguous
// (reallocated) before the new stride is generated.
Tensor &Tensor::Reshape(const Shape &shape) {
  // Check the original volume against the new one;
// do not use Product(shape_) due to stride 0 from broadcasting.
// printf("reshape loc b\n");
CHECK_EQ(Product(shape), Size());
if (transpose()) {
Tensor t(shape_, device_, data_type_);
singa::Transform(*this, &t);
std::swap(t.block_, block_);
shape_ = shape;
} else {
shape_ = shape;
}
generate_stride();
// printf("reshape loc c\n");
return *this;
}
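// Note (sketch): reshaping a transposed (non-contiguous) tensor first
// materializes a contiguous copy via Transform, e.g.
//   Tensor m(Shape{2, 3}, defaultDevice, kFloat32);
//   m.SetValue(1.0f);
//   m.Transpose();        // Shape{3, 2}, strides reversed, same block
//   m.Reshape(Shape{6});  // allocates a contiguous block in transposed order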
Tensor Reshape(const Tensor &in, const Shape &s) {
// printf("reshape loc a\n");
Tensor out(in);
return out.Reshape(s);
}
} // namespace singa