Merge pull request #872 from nudles/dev

Prepare for v3.2.0 release
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d8201f..a722ec9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,10 +29,10 @@
 #string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
 
 
-SET(PACKAGE_VERSION 3.1.0) # ${VERSION})
-SET(VERSION 3.1.0)
+SET(PACKAGE_VERSION 3.2.0) # ${VERSION})
+SET(VERSION 3.2.0)
 SET(SINGA_MAJOR_VERSION 3)
-SET(SINGA_MINOR_VERSION 1)
+SET(SINGA_MINOR_VERSION 2)
 SET(SINGA_PATCH_VERSION 0)
 #SET(SINGA_MAJOR_VERSION ${VERSION_MAJOR})  # 0 -
 #SET(SINGA_MINOR_VERSION ${VERSION_MINOR})  # 0 - 9
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index e9e9ae7..963ad1a 100755
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -674,72 +674,65 @@
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__add__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.AddFloat, self.data, rhs)
-
+            return _call_singa_func(singa.AddFloat, self.data, float(rhs))
+
     def __sub__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__sub__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.SubFloat, self.data, rhs)
-
+            return _call_singa_func(singa.SubFloat, self.data, float(rhs))
+
     def __mul__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__mul__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.MultFloat, self.data, rhs)
-
+            return _call_singa_func(singa.MultFloat, self.data, float(rhs))
+
     def __div__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__div__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.DivFloat, self.data, rhs)
-
+            return _call_singa_func(singa.DivFloat, self.data, float(rhs))
+
     def __truediv__(self, rhs):
-        if isinstance(rhs, Tensor):
-            return from_raw_tensor(singa.__div__(self.data, rhs.data))
-        else:
-            return _call_singa_func(singa.DivFloat, self.data, rhs)
-
+        return self.__div__(rhs)
+
     def __floordiv__(self, rhs):
-        if isinstance(rhs, Tensor):
-            tmp = from_raw_tensor(singa.__div__(self.data, rhs.data))
-            return _call_singa_func(singa.Floor, tmp.data)
-        else:
-            tmp = _call_singa_func(singa.DivFloat, self.data, rhs)
-            return _call_singa_func(singa.Floor, tmp.data)
+        tmp = self.__div__(rhs)
+        return _call_singa_func(singa.Floor, tmp.data)
 
     def __lt__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__lt__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.LTFloat, self.data, rhs)
-
+            return _call_singa_func(singa.LTFloat, self.data, float(rhs))
+
     def __le__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__le__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.LEFloat, self.data, rhs)
-
+            return _call_singa_func(singa.LEFloat, self.data, float(rhs))
+
     def __gt__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__gt__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.GTFloat, self.data, rhs)
-
+            return _call_singa_func(singa.GTFloat, self.data, float(rhs))
+
     def __ge__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__ge__(self.data, rhs.data))
         else:
-            return _call_singa_func(singa.GEFloat, self.data, rhs)
-
+            return _call_singa_func(singa.GEFloat, self.data, float(rhs))
+
     def __eq__(self, rhs):
         if isinstance(rhs, Tensor):
             return from_raw_tensor(singa.__eq__(self.data, rhs.data))
         elif rhs is None:
             return False
         else:
-            return _call_singa_func(singa.EQFloat, self.data, rhs)
-
+            return _call_singa_func(singa.EQFloat, self.data, float(rhs))
+
     def __radd__(self, lhs):
         lhs = float(lhs)
         one = Tensor(self.shape, self.device, self.dtype)
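
Note: the `float(rhs)` coercion means the scalar branch of each operator now accepts any Python number (ints included), and `__truediv__`/`__floordiv__` reuse `__div__` instead of duplicating the Tensor/scalar dispatch. A minimal sketch of the user-visible effect, assuming the usual `singa.tensor` API:

```python
import numpy as np
from singa import tensor

x = tensor.from_numpy(np.ones((2, 3), dtype=np.float32))

y = x + 3   # int scalar: coerced via float(3) before singa.AddFloat
z = x * 2   # same for __mul__ / singa.MultFloat
w = x // 2  # __floordiv__ now divides via __div__, then applies singa.Floor
```
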
diff --git a/setup.py b/setup.py
index f7a3f19..4e9a644 100644
--- a/setup.py
+++ b/setup.py
@@ -83,7 +83,7 @@
 from datetime import date
 
 # stable version
-VERSION = '3.1.0.rc1'
+VERSION = '3.2.0'
 # get the git hash
 # git_hash = subprocess.check_output(["git", "describe"]).strip().split('-')[-1][1:]
 # comment the next line to build wheel for stable version
@@ -267,9 +267,9 @@
     extra_compile_args = {'gcc': get_cpp_flags()}
 
     if with_cuda:
-        cuda9_gencode = (' -gencode arch=compute_35,code=sm_35'
-                         ' -gencode arch=compute_50,code=sm_50'
-                         ' -gencode arch=compute_60,code=sm_60'
+        # compute_35 and compute_50 are removed: (1) they lack half-precision float
+        # support; (2) Google Colab's GPU was upgraded from K80 (compute_35) to T4 (compute_75).
+        cuda9_gencode = (' -gencode arch=compute_60,code=sm_60'
                          ' -gencode arch=compute_70,code=sm_70')
         cuda10_gencode = ' -gencode arch=compute_75,code=sm_75'
         cuda11_gencode = ' -gencode arch=compute_80,code=sm_80'
@@ -422,6 +422,7 @@
         'numpy >=1.16,<2.0',  #1.16
         'onnx==1.6',
         'deprecated',
+        'pytest',
         'unittest-xml-reporting',
         'future',
         'pillow',
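
For context, a hedged summary of the SM architectures the remaining `-gencode` flags target (compute capabilities per NVIDIA's documentation; the GPU examples are illustrative):

```python
# Illustrative map of the gencode flags kept above to example GPUs.
# compute_35 (Kepler, e.g. K80) and compute_50 (Maxwell) are dropped.
gencode_targets = {
    "sm_60": "Pascal, e.g. P100",   # cuda9_gencode
    "sm_70": "Volta, e.g. V100",    # cuda9_gencode
    "sm_75": "Turing, e.g. T4",     # cuda10_gencode
    "sm_80": "Ampere, e.g. A100",   # cuda11_gencode
}
for sm, arch in gencode_targets.items():
    print(f"{sm}: {arch}")
```
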
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 08e5d41..46b82aa 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -1037,33 +1037,59 @@
     });                                                                    \
   } while (0)
 
-#define GenBinaryTensorFn(op, fn)                              \
-  Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
-    if (lhs.shape() != rhs.shape()) {                          \
-      auto lhs_ = Broadcast(lhs, rhs.shape());                 \
-      auto rhs_ = Broadcast(rhs, lhs.shape());                 \
-      Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type()); \
-      fn(lhs_, rhs_, &ret);                                    \
-      return ret;                                              \
-    } else {                                                   \
-      Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());  \
-      fn(lhs, rhs, &ret);                                      \
-      return ret;                                              \
-    }                                                          \
-  }                                                            \
-  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
-    CHECK_EQ(lhs.device(), ret->device());                     \
-    CHECK_EQ(rhs.device(), ret->device());                     \
-    if (lhs.shape() != rhs.shape()) {                          \
-      auto lhs_ = Broadcast(lhs, rhs.shape());                 \
-      auto rhs_ = Broadcast(rhs, lhs.shape());                 \
-      CHECK(lhs_.shape() == ret->shape());                     \
-      EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret);              \
-    } else {                                                   \
-      CHECK(lhs.shape() == ret->shape());                      \
-      EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                \
-    }                                                          \
-  }  // namespace singa
+#define GenBinaryTensorFn(op, fn)                                           \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {                         \
+    if (lhs.shape() != rhs.shape()) {                                       \
+      if (lhs.data_type() == kFloat32 && rhs.data_type() == kFloat32) {     \
+        auto lhs_ = Broadcast(lhs, rhs.shape());                            \
+        auto rhs_ = Broadcast(rhs, lhs.shape());                            \
+        Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type());            \
+        fn(lhs_, rhs_, &ret);                                               \
+        return ret;                                                         \
+      } else {                                                              \
+        /* lhs tensor and rhs tensor are not both in float, cast to float */\
+        Tensor tmp_lhs = lhs.Clone().AsType(kFloat32);                      \
+        Tensor tmp_rhs = rhs.Clone().AsType(kFloat32);                      \
+        tmp_lhs = Broadcast(tmp_lhs, tmp_rhs.shape());                      \
+        tmp_rhs = Broadcast(tmp_rhs, tmp_lhs.shape());                      \
+        Tensor ret(tmp_lhs.shape(), tmp_lhs.device(), tmp_lhs.data_type()); \
+        fn(tmp_lhs, tmp_rhs, &ret);                                         \
+        /* if lhs and rhs are both int, cast back to int */                 \
+        if (lhs.data_type() == kInt && rhs.data_type() == kInt)             \
+          return ret.Clone().AsType(kInt);                                  \
+        return ret;                                                         \
+      }                                                                     \
+    } else {                                                                \
+      if (lhs.data_type() == kFloat32 && rhs.data_type() == kFloat32) {     \
+        Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());             \
+        fn(lhs, rhs, &ret);                                                 \
+        return ret;                                                         \
+      } else {                                                              \
+        /* lhs tensor and rhs tensor are not both in float, cast to float */\
+        Tensor tmp_lhs = lhs.Clone().AsType(kFloat32);                      \
+        Tensor tmp_rhs = rhs.Clone().AsType(kFloat32);                      \
+        Tensor ret(tmp_lhs.shape(), tmp_lhs.device(), tmp_lhs.data_type()); \
+        fn(tmp_lhs, tmp_rhs, &ret);                                         \
+        /* if lhs and rhs are both int, cast back to int */                 \
+        if (lhs.data_type() == kInt && rhs.data_type() == kInt)             \
+          return ret.Clone().AsType(kInt);                                  \
+        return ret;                                                         \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {              \
+    CHECK_EQ(lhs.device(), ret->device());                                  \
+    CHECK_EQ(rhs.device(), ret->device());                                  \
+    if (lhs.shape() != rhs.shape()) {                                       \
+      auto lhs_ = Broadcast(lhs, rhs.shape());                              \
+      auto rhs_ = Broadcast(rhs, lhs.shape());                              \
+      CHECK(lhs_.shape() == ret->shape());                                  \
+      EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret);                           \
+    } else {                                                                \
+      CHECK(lhs.shape() == ret->shape());                                   \
+      EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                             \
+    }                                                                       \
+  }
 
 // broadcasting operations:
 // https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md
@@ -1093,12 +1119,29 @@
   } while (0)
 
 #define GenTensorScalarFn(op, fn)                                          \
-  template <typename SType>                                   \
-  Tensor op(const Tensor &in, const SType x) {                \
-    Tensor ret(in.shape(), in.device(), in.data_type());      \
-    fn(in, x, &ret);                                          \
-    return ret;                                               \
-  }                                                                         \
+  template <typename SType>                                                \
+  Tensor op(const Tensor &in, const SType x) {                             \
+    if (in.data_type() == kFloat32 && std::is_same<SType, float>::value) { \
+      Tensor ret(in.shape(), in.device(), in.data_type());                 \
+      fn(in, x, &ret);                                                     \
+      return ret;                                                          \
+    } else if (in.data_type() == kFloat32) {                               \
+      Tensor ret(in.shape(), in.device(), in.data_type());                 \
+      float tmp_x = x;                                                     \
+      fn(in, tmp_x, &ret);                                                 \
+      return ret;                                                          \
+    } else {                                                               \
+      /* tensor and scalar are not both in float, cast to float */         \
+      Tensor tmp_in = in.Clone().AsType(kFloat32);                         \
+      float tmp_x = x;                                                     \
+      Tensor ret(tmp_in.shape(), tmp_in.device(), tmp_in.data_type());     \
+      fn(tmp_in, tmp_x, &ret);                                             \
+      /* if tensor and scalar are both int, cast back to int */            \
+      if (in.data_type() == kInt && std::is_same<SType, int>::value)       \
+        return ret.Clone().AsType(kInt);                                   \
+      return ret;                                                          \
+    }                                                                      \
+  }                                                                        \
   template <typename SType>                                                \
   void fn(const Tensor &in, const SType x, Tensor *ret) {                  \
     EltwiseTensorScalarFn(fn, in, x, ret);                                 \
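
The expanded `GenBinaryTensorFn` adds implicit type promotion to the binary tensor operators: operands that are not both `kFloat32` are cast to float for the computation, and the result is cast back to `kInt` only when both inputs were int. A sketch of the resulting behavior at the Python level, assuming int32 tensors created through `from_numpy`:

```python
import numpy as np
from singa import tensor

a = tensor.from_numpy(np.array([1, 2], dtype=np.int32))
b = tensor.from_numpy(np.array([3.0, 4.0], dtype=np.float32))

c = a + b  # mixed dtypes: both cast to float32, result stays float32
d = a + a  # both int: computed in float32, then cast back to kInt
print(c.dtype, d.dtype)
```
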
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 2e6a08a..c294c4e 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -55,6 +55,9 @@
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Tensor level math functions.
 
+#define LOG_FATAL(Op, DType, Lang) \
+  LOG(FATAL) << Op << " not implemented for DType=" << typeid(DType).name() << " Lang=" << typeid(Lang).name()
+
 const std::string vec2str(const std::vector<int> &vec) {
   std::ostringstream vts;
   if (!vec.empty()) {
@@ -83,62 +86,63 @@
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Abs Not Implemented";
+  LOG_FATAL("Abs", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void Erf(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Erf Not Implemented";
+  LOG_FATAL("Erf", DType, Lang);
 }
 
 template <typename DTypeSrc, typename DTypeDst, typename Lang>
 void CastCopy(const Tensor *src, Tensor *dst, Context *ctx) {
-  LOG(FATAL) << "CastCopy Not Implemented";
+  LOG(FATAL) << "CastCopy not Implemented for DTypeSrc=" << typeid(DTypeSrc).name() 
+             << " DTypeDst=" << typeid(DTypeDst).name() << " Lang=" << typeid(Lang).name();
 }
 
 template <typename DType, typename Lang>
 void Ceil(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Ceil Not Implemented";
+  LOG_FATAL("Ceil", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void Floor(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Floor Not Implemented";
+  LOG_FATAL("Floor", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void Round(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Round Not Implemented";
+  LOG_FATAL("Round", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void RoundE(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Round Not Implemented";
+  LOG_FATAL("RoundE", DType, Lang);
 }
 
 /// out[i] = in[i] + x
 template <typename DType, typename Lang>
 void Add(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Add Not Implemented";
+  LOG_FATAL("Add", DType, Lang);
 }
 
 /// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
 void Add(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Add-Pair Not Implemented";
+  LOG_FATAL("Add-Pair", DType, Lang);
 }
 /// Clamp every element into [low, high]
 /// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
 void Clamp(const DType low, const DType high, const Tensor &in, Tensor *out,
            Context *ctx) {
-  LOG(FATAL) << "Clamp Not Implemented";
+  LOG_FATAL("Clamp", DType, Lang);
 }
 
 /// out[i] = x / in[i]
 template <typename DType, typename Lang>
 void Div(const DType x, const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Div Not Implemented";
+  LOG_FATAL("Div", DType, Lang);
 }
 
 /// out[i] = in[i] / x
@@ -151,140 +155,140 @@
 /// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
 void Div(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Div-Pair Not Implemented";
+  LOG_FATAL("Div-Pair", DType, Lang);
 }
 
 /// out[i] = in[i] * x
 template <typename DType, typename Lang>
 void EltwiseMult(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "EltwiseMult Not Implemented";
+  LOG_FATAL("EltwiseMult", DType, Lang);
 }
 
 /// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
 void EltwiseMult(const Tensor &in1, const Tensor &in2, Tensor *out,
                  Context *ctx) {
-  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+  LOG_FATAL("EltwiseMult-Pair", DType, Lang);
 }
 
 /// out[i]=(in2[i]>0)?in1[i]:0.f
 template <typename DType, typename Lang>
 void ReLUBackward(const Tensor &in1, const Tensor &in2, Tensor *out,
                   Context *ctx) {
-  LOG(FATAL) << "ReLUBackward Not Implemented";
+  LOG_FATAL("ReLUBackward", DType, Lang);
 }
 
 /// Base is e, Neper number. out[i]=exp(in[i])
 template <typename DType, typename Lang>
 void Exp(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Exp Not Implemented";
+  LOG_FATAL("Exp", DType, Lang);
 }
 
 /// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
 void LE(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "LE Not Implemented";
+  LOG_FATAL("LE", DType, Lang);
 }
 /// out[i]=(in1[i]<=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
 void LE(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
+  LOG_FATAL("Tensor <= Tensor", DType, Lang);
 }
 /// Natural logarithm, the base is e, Neper number out[i]=log(in[i]).
 template <typename DType, typename Lang>
 void Log(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Log Not Implemented";
+  LOG_FATAL("Log", DType, Lang);
 }
 /// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
 void LT(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "LT Not Implemented";
+  LOG_FATAL("LT", DType, Lang);
 }
 /// out[i]=(in1[i]<in2[i])?1.f:0.f
 template <typename DType, typename Lang>
 void LT(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
+  LOG_FATAL("Tensor Tensor LT", DType, Lang);
 }
 /// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
 void GE(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "GE Not Implemented";
+  LOG_FATAL("GE", DType, Lang);
 }
 /// out[i]=(in1[i]>=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
 void GE(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
+  LOG_FATAL("Tensor Tensor GE", DType, Lang);
 }
 /// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
 void GT(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "GT Not Implemented";
+  LOG_FATAL("GT", DType, Lang);
 }
 /// out[i]=(in[i]>in2[i])?1.f:0.f
 template <typename DType, typename Lang>
 void GT(const Tensor &in, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
+  LOG_FATAL("Tensor Tensor GT", DType, Lang);
 }
 /// out[i]=(in[i]==x)?1.f:0.f
 template <typename DType, typename Lang>
 void EQ(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "EQ Not Implemented";
+  LOG_FATAL("EQ", DType, Lang);
 }
 /// out[i]=(in[i]==in2[i])?1.f:0.f
 template <typename DType, typename Lang>
 void EQ(const Tensor &in, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Tensor-Tensor EQ Not Implemented";
+  LOG_FATAL("Tensor Tensor EQ", DType, Lang);
 }
 /// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
 void Pow(const Tensor &in, const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Pow Not Implemented";
+  LOG_FATAL("Pow", DType, Lang);
 }
 
 /// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
 void Pow(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Pow-Pair Not Implemented";
+  LOG_FATAL("Tensor Tensor Pow", DType, Lang);
 }
 
 /// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
 void ReLU(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "ReLU Not Implemented";
+  LOG_FATAL("ReLU", DType, Lang);
 }
 
 /// out[i] = x
 template <typename DType, typename Lang>
 void Set(const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Set Not Implemented";
+  LOG_FATAL("Set", DType, Lang);
 }
 /// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
 void Sigmoid(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Sigmoid Not Implemented";
+  LOG_FATAL("Sigmoid", DType, Lang);
 }
 
 /// out[i] = log(exp(in[i]) + 1)
 template <typename DType, typename Lang>
 void SoftPlus(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "SoftPlus Not Implemented";
+  LOG_FATAL("SoftPlus", DType, Lang);
 }
 
 /// out[i] = in[i] / (abs(in[i]) + 1)
 template <typename DType, typename Lang>
 void SoftSign(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "SoftSign Not Implemented";
+  LOG_FATAL("SoftSign", DType, Lang);
 }
 
 /// out[i] = sign(in[i])
 template <typename DType, typename Lang>
 void Sign(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Sign Not Implemented";
+  LOG_FATAL("Sign", DType, Lang);
 }
 /// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
 void Sqrt(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Sqrt Not Implemented";
+  LOG_FATAL("Sqrt", DType, Lang);
 }
 
 /// out[i]=square(in[i])
@@ -302,13 +306,13 @@
 /// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
 void Sub(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Sub-Pair Not Implemented";
+  LOG_FATAL("Tensor Tensor Sub", DType, Lang);
 }
 
 /// sum all elements of in into out
 template <typename DType, typename Lang>
 void Sum(const Tensor &in, DType *out, Context *ctx) {
-  LOG(FATAL) << "Sum Not Implemented";
+  LOG_FATAL("Sum", DType, Lang);
 }
 
 /// out[i]=fn(in[i])
@@ -316,8 +320,7 @@
   template <typename DType, typename Lang>               \
   void fn(const Tensor &in, Tensor *out, Context *ctx) { \
     std::string str = stringfn;                          \
-    str += " Not Implemented";                           \
-    LOG(FATAL) << str;                                   \
+    LOG_FATAL(str, DType, Lang);                         \
   }
 
 GenUnaryNotImplemented(Cos, "Cos");
@@ -339,7 +342,7 @@
 /// strides
 template <typename DType, typename Lang>
 void Transform(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Transform Not Implemented";
+  LOG_FATAL("Transform", DType, Lang);
 }
 
 // **************************************
@@ -350,19 +353,19 @@
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
 void Bernoulli(const float p, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Bernoulli Not Implemented";
+  LOG_FATAL("Bernoulli", DType, Lang);
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
 void Gaussian(const DType mean, const DType std, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Gaussian Not Implemented";
+  LOG_FATAL("Gaussian", DType, Lang);
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
 void Uniform(const DType low, const DType high, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Uniform Not Implemented";
+  LOG_FATAL("Uniform", DType, Lang);
 }
 
 // *********************************************************
@@ -372,52 +375,52 @@
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const Tensor &in, size_t *out, Context *ctx) {
-  LOG(FATAL) << "Amax Not Implemented";
+  LOG_FATAL("Amax", DType, Lang);
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
 void Amin(const Tensor &in, size_t *out, Context *ctx) {
-  LOG(FATAL) << "Amin Not Implemented";
+  LOG_FATAL("Amin", DType, Lang);
 }
 /// out = sum |x| for all x in in
 template <typename DType, typename Lang>
 void Asum(const Tensor &in, DType *out, Context *ctx) {
-  LOG(FATAL) << "Asum Not Implemented";
+  LOG_FATAL("Asum", DType, Lang);
 }
 
 /// out = alpha * in + out
 template <typename DType, typename Lang>
 void Axpy(const DType alpha, const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Axpy Not Implemented";
+  LOG_FATAL("Axpy", DType, Lang);
 }
 
 /// out = alpha * in + out
 template <typename DType, typename Lang>
 void Axpy(const Tensor &alpha, const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Axpy Not Implemented";
+  LOG_FATAL("Axpy Tensor alpha", DType, Lang);
 }
 
 /// out = ||in||_2, i.e., the L2 norm.
 template <typename DType, typename Lang>
 void Nrm2(const Tensor &in, float *out, Context *ctx) {
-  LOG(FATAL) << "Nrm2 Not Implemented";
+  LOG_FATAL("Nrm2", DType, Lang);
 }
 
 /// out *= x
 template <typename DType, typename Lang>
 void Scale(const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Scale Not Implemented";
+  LOG_FATAL("Scale", DType, Lang);
 }
 
 /// inner product of array in1 and in2
 template <typename DType, typename Lang>
 void Dot(const Tensor &in1, const Tensor &in2, DType *out, Context *ctx) {
-  LOG(FATAL) << "Dot Not Implemented";
+  LOG_FATAL("Inner-product Dot", DType, Lang);
 }
 template <typename DType, typename Lang>
 void Dot(const Tensor &in1, const Tensor &in2, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Dot Not Implemented";
+  LOG_FATAL("Dot", DType, Lang);
 }
 
 /// out = alpha * A * v + beta * out.
@@ -425,7 +428,7 @@
 template <typename DType, typename Lang>
 void GEMV(const DType alpha, const Tensor &A, const Tensor &v, const DType beta,
           Tensor *out, Context *ctx) {
-  LOG(FATAL) << "GEMV Not Implemented";
+  LOG_FATAL("GEMV", DType, Lang);
 }
 
 /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
@@ -433,7 +436,7 @@
 template <typename DType, typename Lang>
 void DGMM(const bool side_right, const Tensor &M, const Tensor &v, Tensor *out,
           Context *ctx) {
-  LOG(FATAL) << "DGMM Not Implemented";
+  LOG_FATAL("DGMM", DType, Lang);
 }
 
 /// C = alpha * A * B + beta * C.
@@ -441,24 +444,24 @@
 template <typename DType, typename Lang>
 void GEMM(const DType alpha, const Tensor &A, const Tensor &B, const DType beta,
           Tensor *C, Context *ctx) {
-  LOG(FATAL) << "GEMM Not Implemented";
+  LOG_FATAL("GEMM", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void GEMMBatched(const DType alpha, const Tensor &A, const Tensor &B,
                  const DType beta, Tensor *C, Context *ctx) {
-  LOG(FATAL) << "GEMM Batched Not Implemented";
+  LOG_FATAL("GEMMBatched", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void SoftMax(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG_FATAL("SoftMax", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void SoftMaxBackward(const Tensor &in, Tensor *out, const Tensor &fdout,
                      Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG_FATAL("SoftMaxBackend", DType, Lang);
 }
 
 // yisen todo
@@ -466,68 +469,20 @@
 void ComputeCrossEntropy(bool int_target, const size_t batchsize,
                          const size_t dim, const Tensor &p, const Tensor &t,
                          Tensor *loss, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG_FATAL("ComputeCrossEntropy", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize,
                             const size_t dim, const Tensor &p, const Tensor &t,
                             Tensor *grad, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG_FATAL("ComputeCrossEntropyBwd", DType, Lang);
 }
 
 template <typename DType, typename Lang>
 void RowMax(const Tensor &in, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG_FATAL("RowMax", DType, Lang);
 }
-// **************************************
-// Matrix functions
-// **************************************
-/*
-/// Add the vector v to every column of A as the column of out
-template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor
-&v,
-            Tensor *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
-}
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
-template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor
-&v,
-            Tensor *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
-}
-/// outer-product.
-/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
-template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Tensor &in1, const Tensor &in2,
-           Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Outer Not Implemented";
-}
-
-/// Sum the columns of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Tensor &in, Tensor
-*out,
-                Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
-}
-template <typename DType, typename Lang>
-void Set(const DType x, Tensor *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Tensor &in, Tensor
-*out,
-             Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
-}
-*/
 
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
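
With the `LOG_FATAL` macro, every unimplemented-kernel abort now reports the `DType`/`Lang` combination instead of a bare "Not Implemented". A hypothetical reconstruction of the message format in Python (the `typeid(...).name()` strings are implementation-defined; `"i"` and the mangled `lang::Cpp` name below assume GCC):

```python
# Hypothetical sketch of the message the new macro emits.
def log_fatal(op, dtype_name, lang_name):
    raise RuntimeError(f"{op} not implemented for DType={dtype_name} Lang={lang_name}")

try:
    log_fatal("Sqrt", "i", "N5singa4lang3CppE")  # e.g. Sqrt on an int CPU tensor
except RuntimeError as e:
    print(e)
```
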
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2c06f63..b3113ab 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -785,20 +785,6 @@
   *out = cblas_sasum(in.Size(), inPtr, 1);  // not using strided traversal
 }
 
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                             const Tensor& in, Tensor *out, Context *ctx) {
-//   //check input tensor for strides first
-//   if (in.stride() == out->stride()) {
-//     const float *inPtr = static_cast<const float *>(in.block()->data());
-//     float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
-//   } else {
-//     //LOG(FATAL) << "Axpy, input and output strides do not match." ;
-//     EltwiseMult<float, lang::Cpp>(in, alpha, out, ctx);
-//   }
-// }
-
 template <>
 void Axpy<float, lang::Cpp>(const float alpha, const Tensor &in, Tensor *out,
                             Context *ctx) {
@@ -817,20 +803,25 @@
   }
 }
 
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                            const Tensor& in, Tensor *out, Context *ctx) {
-//  //check input tensor for strides first
-//  if (in.stride() == out->stride()) {
-//    const float *inPtr = static_cast<const float *>(in.block()->data());
-//    float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//    cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
-//  } else if(out->transpose()) {
-//    LOG(FATAL) << "output is already transposed." ;
-//  } else {
-//    LOG(FATAL) << "Axpy, input and output strides do not match." ;
-//  }
-// }
+template <>
+void Axpy<float, lang::Cpp>(const Tensor &alpha, const Tensor &in, Tensor *out,
+                            Context *ctx) {
+  // check input tensor for strides first
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float a = *static_cast<const float*>(alpha.block()->data());
+
+  if (in.stride() == out->stride()) {
+    cblas_saxpy(in.Size(), a, inPtr, 1, outPtr, 1);
+  } else {
+    // LOG(FATAL) << "Axpy, input and output strides do not match." ;
+    Tensor t(in.shape(), in.device(), in.data_type());
+    EltwiseMult<float, lang::Cpp>(in, a, &t, ctx);
+    float *tPtr = static_cast<float *>(t.block()->mutable_data());
+    cblas_saxpy(in.Size(), 1.0f, tPtr, 1, outPtr, 1);
+  }
+}
+
 
 template <>
 void Dot<float, lang::Cpp>(const Tensor &in1, const Tensor &in2, float *out,
@@ -1148,121 +1139,7 @@
   }
 }
 
-// =========Matrix operations ================================================
-/*
-template <>
-void SoftMax<float, lang::Cpp>(const Tensor &in, Tensor *out, Context* ctx) {
-  CHECK_LE(in.nDim(), 2u) << "Axis is required for SoftMax on multi dimemsional
-tensor";
-  out->CopyData(in);
-  size_t nrow = 1, ncol = in.Size(), size = ncol;
-  if (in.nDim() == 2u) {
-    nrow = in.shape(0);
-    ncol = size / nrow;
-    out->Reshape(Shape{nrow, ncol});
-  }
-  Tensor tmp = RowMax(*out);
-  SubColumn(tmp, out);
-  Exp(*out, out);
 
-  SumColumns(*out, &tmp);
-  DivColumn(tmp, out);
-  out->Reshape(in.shape());
-}
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor& A, const Tensor& v, Tensor* out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A.data());
-  const float *vPtr = static_cast<const float *>(v.data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-    }
-  }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor& A, const Tensor& v, Tensor* out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A.data());
-  const float *vPtr = static_cast<const float *>(v.data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-    }
-  }
-}
-template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor& in1,
-                             const Tensor& in2, Tensor* out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1.data());
-  const float *in2Ptr = static_cast<const float *>(in2.data());
-  for (size_t r = 0; r < m; r++) {
-    size_t offset = r * n;
-    for (size_t c = 0; c < n; c++) {
-      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-    }
-  }
-}
-template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor& in, Tensor* out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.data());
-  float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      bPtr[c] = exp(inPtr[offset + c]);
-      denom += bPtr[c];
-    }
-    for (size_t c = 0; c < ncol; c++) {
-      size_t idx = offset + c;
-      outPtr[idx] = bPtr[c] / denom;
-    }
-  }
-  delete bPtr;
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Tensor& in, Tensor* out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.data());
-  for (size_t c = 0; c < ncol; c++) {
-    outPtr[c] = 0.f;
-  }
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[c] += inPtr[offset + c];
-    }
-  }
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor& in, Tensor* out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    outPtr[r] = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[r] += inPtr[offset + c];
-    }
-  }
-}
-*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
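
The new `Axpy` overload reads its scalar out of a one-element `alpha` tensor and, when the input and output strides differ, falls back to an elementwise multiply followed by a unit-alpha `saxpy`. A NumPy sketch of what that fallback computes, assuming a single-element `alpha`:

```python
import numpy as np

alpha = np.array([0.5], dtype=np.float32)  # one-element alpha tensor
x = np.arange(6, dtype=np.float32)
out = np.ones(6, dtype=np.float32)

a = alpha[0]  # *static_cast<const float *>(alpha.block()->data())
t = x * a     # EltwiseMult<float, lang::Cpp>(in, a, &t, ctx)
out += t      # cblas_saxpy(n, 1.0f, tPtr, 1, outPtr, 1)
print(out)    # out = alpha * in + out
```
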
diff --git a/test/python/test_tensor.py b/test/python/test_tensor.py
index 82d6d5c..f7044f1 100644
--- a/test/python/test_tensor.py
+++ b/test/python/test_tensor.py
@@ -19,6 +19,7 @@
 
 import math
 import unittest
+import random
 import numpy as np
 
 from singa import tensor
@@ -546,10 +547,10 @@
         x_val = np.random.randint(0, 10, (2, 3))
         x = tensor.from_numpy(x_val)
         x.to_device(dev)
-        scalar = np.random.random((1,))[0] * 100
+        scalar = random.random() * 100
         y = x + scalar
         self.assertEqual(y.dtype, tensor.float32)
-        np.testing.assert_array_almost_equal(tensor.to_numpy(y), x_val + scalar)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(y), x_val + scalar, decimal=5)
 
     @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled')
     def test_kint_float_gpu(self):
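
A note on the test change: `random.random()` yields a plain Python `float`, whereas `np.random.random((1,))[0]` yields a `numpy.float64`; the widened tolerance (`decimal=5`) matches single-precision arithmetic, since float32 carries roughly seven significant digits. An illustrative check:

```python
import random
import numpy as np

print(type(np.random.random((1,))[0]))  # <class 'numpy.float64'>
print(type(random.random()))            # <class 'float'>
```
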
diff --git a/tool/conda/gpu/meta.yaml b/tool/conda/gpu/meta.yaml
index 58ef499..68df6b3 100644
--- a/tool/conda/gpu/meta.yaml
+++ b/tool/conda/gpu/meta.yaml
@@ -26,7 +26,7 @@
 
 requirements:
   run:
-    - singa {{ environ.get('GIT_DESCRIBE_TAG') | replace("-", ".") }} cudnn7.6.5_cuda10.0_py{{ py }}
+    - singa {{ environ.get('GIT_DESCRIBE_TAG') | replace("-", ".") }} cudnn7.6.5_cuda10.2_nccl2.6.4.1_mpich3.3.2_py{{ py }}
 
 build:
   number: 0
diff --git a/tool/conda/singa/meta.yaml b/tool/conda/singa/meta.yaml
index cface0d..5cacf3b 100644
--- a/tool/conda/singa/meta.yaml
+++ b/tool/conda/singa/meta.yaml
@@ -20,7 +20,7 @@
 # https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html#templating-with-jinja
 # {% set data = load_setup_py_data(setup_file='../../../python/singa/setup.py', from_recipe_dir=True) %}
 
-{% set version = "2.1.0.dev" %}
+{% set version = "3.2.0" %}
 
 package:
   name: singa
diff --git a/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014 b/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
index d3aeaff..80ca788 100644
--- a/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
+++ b/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
@@ -119,6 +119,14 @@
     rm cudnn-10.2-linux-x64-v7.6.5.32.tgz && \
     ldconfig
 
+# install nccl for distributed training
+RUN git clone https://github.com/NVIDIA/nccl.git $HOME/nccl \
+    && cd $HOME/nccl \
+    && git checkout v2.4.8-1 \
+    && make BUILDDIR=/usr/local/ -j$(nproc) src.build \
+    && rm -rf /usr/local/obj \
+    && rm -rf $HOME/nccl
+
 # install cnmem to /usr/local/include  /usr/local/lib
 RUN git clone https://github.com/NVIDIA/cnmem.git cnmem \
     && cd cnmem && mkdir build && cd build && cmake .. && make && make install && cd ../.. && rm -rf cnmem