# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import unittest
from builtins import str
from singa import tensor
from singa import singa_wrap as singa
from singa import autograd
from singa import layer
from singa import singa_wrap
from cuda_helper import gpu_dev, cpu_dev
import numpy as np
autograd.training = True
CTensor = singa.Tensor
dy = CTensor([2, 1, 2, 2])
singa.Gaussian(0.0, 1.0, dy)
def _tuple_to_string(t):
lt = [str(x) for x in t]
return '(' + ', '.join(lt) + ')'
def axis_helper(y_shape, x_shape):
"""
check along which axes x has been broadcast
Args:
y_shape: the shape of the result
x_shape: the shape of x
Return:
a tuple of the broadcast axes
"""
res = []
j = len(x_shape) - 1
for i in range(len(y_shape) - 1, -1, -1):
if j < 0 or x_shape[j] != y_shape[i]:
res.append(i)
j -= 1
return tuple(res[::-1])
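# e.g. axis_helper((3, 4, 5), (5,)) == (0, 1): x was broadcast along axes 0 and 1;
# axis_helper((3, 4, 5, 6), (1, 4, 1, 6)) == (0, 2) for the size-1 dims of x.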
def prepare_inputs_targets_for_rnn_test(dev):
x_0 = np.random.random((2, 3)).astype(np.float32)
x_1 = np.random.random((2, 3)).astype(np.float32)
x_2 = np.random.random((2, 3)).astype(np.float32)
h_0 = np.zeros((2, 2)).astype(np.float32)
t_0 = np.random.random((2, 2)).astype(np.float32)
t_1 = np.random.random((2, 2)).astype(np.float32)
t_2 = np.random.random((2, 2)).astype(np.float32)
x0 = tensor.Tensor(device=dev, data=x_0)
x1 = tensor.Tensor(device=dev, data=x_1)
x2 = tensor.Tensor(device=dev, data=x_2)
h0 = tensor.Tensor(device=dev, data=h_0)
t0 = tensor.Tensor(device=dev, data=t_0)
t1 = tensor.Tensor(device=dev, data=t_1)
t2 = tensor.Tensor(device=dev, data=t_2)
inputs = [x0, x1, x2]
targets = [t0, t1, t2]
return inputs, targets, h0
class TestPythonOperation(unittest.TestCase):
def check_shape(self, actual, expect):
self.assertEqual(
actual, expect, 'shape mismatch, actual shape is %s'
' expected is %s' %
(_tuple_to_string(actual), _tuple_to_string(expect)))
def _greater_helper(self, dev):
x0 = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
x1 = np.array([0, -0.3, 0, 0.1, 0, 0.9]).reshape(3,
2).astype(np.float32)
y = np.greater(x0, x1)
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd.greater(x0, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_Greater_cpu(self):
self._greater_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Greater_gpu(self):
self._greater_helper(gpu_dev)
def _conv2d_helper(self, dev):
# (out_channels, kernel_size)
conv_0 = layer.Conv2d(1, 2)
conv_without_bias_0 = layer.Conv2d(1, 2, bias=False)
cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=dev)
cpu_input_tensor.gaussian(0.0, 1.0)
dy = tensor.Tensor(shape=(2, 1, 2, 2), device=dev)
dy.gaussian(0.0, 1.0)
y = conv_0(cpu_input_tensor) # PyTensor
dx, dW, db = y.creator.backward(dy.data) # CTensor
self.check_shape(y.shape, (2, 1, 2, 2))
self.check_shape(dx.shape(), (2, 3, 3, 3))
self.check_shape(dW.shape(), (1, 3, 2, 2))
self.check_shape(db.shape(), (1,))
# forward without bias
y_without_bias = conv_without_bias_0(cpu_input_tensor)
self.check_shape(y_without_bias.shape, (2, 1, 2, 2))
def test_conv2d_cpu(self):
self._conv2d_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_conv2d_gpu(self):
self._conv2d_helper(gpu_dev)
def _conv_same_pad(self, dev, pad_mode, is_2d):
if is_2d:
x_h, w_h, k_h, p_h = 32, 4, 4, 1
else:
x_h, w_h, k_h, p_h = 1, 1, 1, 0
x = tensor.Tensor(shape=(3, 3, x_h, 32), device=dev)
x.gaussian(0.0, 1.0)
# with SAME padding, the total padding along each spatial dim should be 3
# for SAME_UPPER, it is (1, 1) + (0, 1)
# for SAME_LOWER, it is (1, 1) + (1, 0)
kernel = (k_h, 4)
padding = (p_h, 1)
stride = (1, 1)
group = 1
bias = False
out_channels = 3
conv_0 = layer.Conv2d(out_channels,
kernel,
stride=stride,
group=group,
bias=bias,
pad_mode=pad_mode)
y = conv_0(x)
dy = np.ones((3, 3, x_h, 32), dtype=np.float32)
dy = tensor.from_numpy(dy)
dy.to_device(dev)
dx, dW = y.creator.backward(dy.data)
self.check_shape(y.shape, (3, 3, x_h, 32))
self.check_shape(dx.shape(), (3, 3, x_h, 32))
self.check_shape(dW.shape(), (3, 3, w_h, 4))
def test_conv2d_same_pad_cpu(self):
self._conv_same_pad(cpu_dev, "SAME_LOWER", True)
self._conv_same_pad(cpu_dev, "SAME_UPPER", True)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_conv2d_same_pad_gpu(self):
self._conv_same_pad(gpu_dev, "SAME_LOWER", True)
self._conv_same_pad(gpu_dev, "SAME_UPPER", True)
def test_conv1d_same_pad_cpu(self):
self._conv_same_pad(cpu_dev, "SAME_LOWER", False)
self._conv_same_pad(cpu_dev, "SAME_UPPER", False)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_conv1d_same_pad_gpu(self):
self._conv_same_pad(gpu_dev, "SAME_LOWER", False)
self._conv_same_pad(gpu_dev, "SAME_UPPER", False)
def _pooling_same_pad(self, dev, pad_mode, is_2d):
if is_2d:
x_h, k_h, p_h = 32, 4, 1
else:
x_h, k_h, p_h = 1, 1, 0
x = tensor.Tensor(shape=(3, 3, x_h, 32), device=dev)
x.gaussian(0.0, 1.0)
# with SAME padding, the total padding along each spatial dim should be 3
# for SAME_UPPER, it is (1, 1) + (0, 1)
# for SAME_LOWER, it is (1, 1) + (1, 0)
kernel = (k_h, 4)
# symmetric padding would add one extra row/column here; the op is expected to trim it
padding = (p_h, 1)
stride = (1, 1)
pooling = layer.Pooling2d(kernel, stride=stride, pad_mode=pad_mode)
y = pooling(x)
dy = np.ones((3, 3, x_h, 32), dtype=np.float32)
dy = tensor.from_numpy(dy)
dy.to_device(dev)
dx = y.creator.backward(dy.data)
self.check_shape(y.shape, (3, 3, x_h, 32))
self.check_shape(dx.shape(), (3, 3, x_h, 32))
def test_pooling2d_same_pad_cpu(self):
self._pooling_same_pad(cpu_dev, "SAME_LOWER", True)
self._pooling_same_pad(cpu_dev, "SAME_UPPER", True)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_pooling2d_same_pad_gpu(self):
self._pooling_same_pad(gpu_dev, "SAME_LOWER", True)
self._pooling_same_pad(gpu_dev, "SAME_UPPER", True)
def test_pooling1d_same_pad_cpu(self):
self._pooling_same_pad(cpu_dev, "SAME_LOWER", False)
self._pooling_same_pad(cpu_dev, "SAME_UPPER", False)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_pooling1d_same_pad_gpu(self):
self._pooling_same_pad(gpu_dev, "SAME_LOWER", False)
self._pooling_same_pad(gpu_dev, "SAME_UPPER", False)
def _sum_helper(self, dev):
x = np.array([0.1, -1.0, 0.4, 4.0, -0.9,
9.0]).reshape(3, 2).astype(np.float32)
x1 = np.array([0.1, 1.0, 0.4, 4.0, 0.9,
9.0]).reshape(3, 2).astype(np.float32)
y = x + x1
dy = np.ones((3, 2), dtype=np.float32)
grad0 = dy
grad1 = dy
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.sum(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_sum_cpu(self):
self._sum_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_sum_gpu(self):
self._sum_helper(gpu_dev)
def _SeparableConv2d_helper(self, dev):
# SeparableConv2d(in_channels, out_channels, kernel_size)
if dev == cpu_dev:
in_channels = 1
else:
in_channels = 8
separ_conv = layer.SeparableConv2d(16, 3, padding=1)
x = np.random.random((10, in_channels, 28, 28)).astype(np.float32)
x = tensor.Tensor(device=dev, data=x)
y = separ_conv(x)
self.check_shape(y.shape, (10, 16, 28, 28))
y1 = separ_conv.depthwise_conv(x)
y2 = separ_conv.point_conv(y1)
dy1, dW_depth = y2.creator.backward(y2.data)
dx, dW_spacial = y1.creator.backward(dy1)
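# note: y2.creator is the pointwise (1x1) conv, so dW_depth above holds the
# pointwise weight gradient and dW_spacial the 3x3 depthwise one, as the shape
# checks below reflect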
self.check_shape(y2.shape, (10, 16, 28, 28))
self.check_shape(dy1.shape(), (10, in_channels, 28, 28))
self.check_shape(dW_depth.shape(), (16, in_channels, 1, 1))
self.check_shape(dx.shape(), (10, in_channels, 28, 28))
self.check_shape(dW_spacial.shape(), (in_channels, 1, 3, 3))
def test_SeparableConv2d_cpu(self):
self._SeparableConv2d_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_SeparableConv2d_gpu(self):
self._SeparableConv2d_helper(gpu_dev)
def _batchnorm2d_helper(self, dev):
batchnorm_0 = layer.BatchNorm2d(3)
cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=dev)
cpu_input_tensor.gaussian(0.0, 1.0)
dy = cpu_input_tensor.clone().data
y = batchnorm_0(cpu_input_tensor)
dx, ds, db = y.creator.backward(dy)
self.check_shape(y.shape, (2, 3, 3, 3))
self.check_shape(dx.shape(), (2, 3, 3, 3))
self.check_shape(ds.shape(), (3,))
self.check_shape(db.shape(), (3,))
def test_batchnorm2d_cpu(self):
self._batchnorm2d_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_batchnorm2d_gpu(self):
self._batchnorm2d_helper(gpu_dev)
def gradients_check(self,
func,
param,
autograds,
h=0.0005,
df=1,
dev=cpu_dev):
# param: PyTensor
# autograds: numpy_tensor
p = tensor.to_numpy(param)
it = np.nditer(p, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
idx = it.multi_index
diff = np.zeros_like(p)
diff[idx] += h
diff = tensor.from_numpy(diff)
diff.to_device(dev)
param += diff
pos = func()
pos = tensor.to_numpy(pos)
param -= diff
param -= diff
neg = func()
neg = tensor.to_numpy(neg)
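# central-difference estimate of d loss / d param[idx]:
# sum((f(p + h) - f(p - h)) * df) / (2 * h)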
numerical_grad = np.sum((pos - neg) * df) / (2 * h)
#print((autograds[idx] - numerical_grad)/numerical_grad)
# threshold set as -5% to +5%
#self.assertAlmostEqual((autograds[idx] - numerical_grad)/(numerical_grad+0.0000001), 0., places=1)
self.assertAlmostEqual(autograds[idx] - numerical_grad,
0.,
places=2)
it.iternext()
def _vanillaRNN_gpu_tiny_ops_shape_check_helper(self, dev):
# gradients shape check.
inputs, target, h0 = prepare_inputs_targets_for_rnn_test(dev)
rnn = layer.RNN(3, 2)
hs, _ = rnn(inputs, h0)
loss = autograd.softmax_cross_entropy(hs[0], target[0])
for i in range(1, len(hs)):
l = autograd.softmax_cross_entropy(hs[i], target[i])
loss = autograd.add(loss, l)
# d=autograd.infer_dependency(loss.creator)
# print(d)
for t, dt in autograd.backward(loss):
self.check_shape(t.shape, dt.shape)
def test_vanillaRNN_gpu_tiny_ops_shape_check_cpu(self):
self._vanillaRNN_gpu_tiny_ops_shape_check_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_vanillaRNN_gpu_tiny_ops_shape_check_gpu(self):
self._vanillaRNN_gpu_tiny_ops_shape_check_helper(gpu_dev)
def _LSTM_gpu_tiny_ops_shape_check_helper(self, dev):
# gradients shape check.
inputs, target, h0 = prepare_inputs_targets_for_rnn_test(dev)
c_0 = np.random.random((2, 1)).astype(np.float32)
c0 = tensor.Tensor(device=dev, data=c_0)
rnn = layer.LSTM(3, 2)
hs, _, _ = rnn(inputs, (h0, c0))
loss = autograd.softmax_cross_entropy(hs[0], target[0])
for i in range(1, len(hs)):
l = autograd.softmax_cross_entropy(hs[i], target[i])
loss = autograd.add(loss, l)
# d=autograd.infer_dependency(loss.creator)
# print(d)
for t, dt in autograd.backward(loss):
self.check_shape(t.shape, dt.shape)
def test_LSTM_gpu_tiny_ops_shape_check_cpu(self):
self._LSTM_gpu_tiny_ops_shape_check_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_LSTM_gpu_tiny_ops_shape_check_gpu(self):
self._LSTM_gpu_tiny_ops_shape_check_helper(gpu_dev)
def _numerical_gradients_check_for_vallina_rnn_helper(self, dev):
inputs, target, h0 = prepare_inputs_targets_for_rnn_test(dev)
rnn = layer.RNN(3, 2)
def valinna_rnn_forward():
hs, _ = rnn(inputs, h0)
loss = autograd.softmax_cross_entropy(hs[0], target[0])
for i in range(1, len(hs)):
l = autograd.softmax_cross_entropy(hs[i], target[i])
loss = autograd.add(loss, l)
#grads = autograd.gradients(loss)
return loss
loss1 = valinna_rnn_forward()
auto_grads = autograd.gradients(loss1)
params = rnn.get_params()
for key, param in params.items():
auto_grad = tensor.to_numpy(auto_grads[id(param)])
self.gradients_check(valinna_rnn_forward, param, auto_grad, dev=dev)
def _gradient_check_cudnn_rnn(self, mode="vanilla", dev=gpu_dev):
seq = 10
bs = 2
fea = 10
hid = 10
x = np.random.random((seq, bs, fea)).astype(np.float32)
tx = tensor.Tensor(device=dev, data=x)
y = np.random.random((seq, bs, hid)).astype(np.float32)
y = np.reshape(y, (-1, hid))
ty = tensor.Tensor(device=dev, data=y)
rnn = layer.CudnnRNN(hid, rnn_mode=mode, return_sequences=True)
def vanilla_rnn_forward():
out = rnn(tx)
out = autograd.reshape(out, (-1, hid))
loss = autograd.softmax_cross_entropy(out, ty)
return loss
loss = vanilla_rnn_forward()
auto_grads = autograd.gradients(loss)
params = rnn.get_params()
for key, param in params.items():
auto_grad = tensor.to_numpy(auto_grads[id(param)])
self.gradients_check(vanilla_rnn_forward, param, auto_grad, dev=dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gradient_check_cudnn_rnn_vanilla(self):
self._gradient_check_cudnn_rnn(mode="vanilla", dev=gpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gradient_check_cudnn_rnn_lstm(self):
self._gradient_check_cudnn_rnn(mode="lstm", dev=gpu_dev)
# Cos Sim Gradient Check
def _gradient_check_cossim(self, dev=gpu_dev):
bs = 2
vec = 3
ta = tensor.random((bs, vec), dev)
tb = tensor.random((bs, vec), dev)
# treat ta, tb as params
ta.stores_grad = True
tb.stores_grad = True
ty = tensor.random((bs,), dev)
def _forward():
out = autograd.cossim(ta, tb)
loss = autograd.mse_loss(out, ty)
return loss
loss = _forward()
auto_grads = autograd.gradients(loss)
params = {id(ta): ta, id(tb): tb}
for key, param in params.items():
auto_grad = tensor.to_numpy(auto_grads[id(param)])
self.gradients_check(_forward, param, auto_grad, dev=dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gradient_check_cossim_gpu(self):
self._gradient_check_cossim(dev=gpu_dev)
def test_gradient_check_cossim_cpu(self):
self._gradient_check_cossim(dev=cpu_dev)
def test_numerical_gradients_check_for_vallina_rnn_cpu(self):
self._numerical_gradients_check_for_vallina_rnn_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_numerical_gradients_check_for_vallina_rnn_gpu(self):
self._numerical_gradients_check_for_vallina_rnn_helper(gpu_dev)
def _numerical_gradients_check_for_lstm_helper(self, dev):
inputs, target, h0 = prepare_inputs_targets_for_rnn_test(dev)
c_0 = np.zeros((2, 2)).astype(np.float32)
c0 = tensor.Tensor(device=dev, data=c_0)
rnn = layer.LSTM(3, 2)
def lstm_forward():
hs, _, _ = rnn(inputs, (h0, c0))
loss = autograd.softmax_cross_entropy(hs[0], target[0])
for i in range(1, len(hs)):
l = autograd.softmax_cross_entropy(hs[i], target[i])
loss = autograd.add(loss, l)
return loss
loss1 = lstm_forward()
auto_grads = autograd.gradients(loss1)
params = rnn.get_params()
for key, param in params.items():
auto_grad = tensor.to_numpy(auto_grads[id(param)])
self.gradients_check(lstm_forward, param, auto_grad, dev=dev)
def test_numerical_gradients_check_for_lstm_cpu(self):
self._numerical_gradients_check_for_lstm_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_numerical_gradients_check_for_lstm_gpu(self):
self._numerical_gradients_check_for_lstm_helper(gpu_dev)
def _MeanSquareError_helper(self, dev):
X = np.array([4.3, 5.4, 3.3, 3.6, 5.7,
6.0]).reshape(3, 2).astype(np.float32)
T = np.array([4.4, 5.3, 3.2, 3.7, 5.4,
6.3]).reshape(3, 2).astype(np.float32)
x = tensor.from_numpy(X)
t = tensor.from_numpy(T)
x.to_device(dev)
t.to_device(dev)
loss = autograd.mse_loss(x, t)
dx = loss.creator.backward()
loss_np = tensor.to_numpy(loss)[0]
self.assertAlmostEqual(loss_np, 0.0366666, places=4)
self.check_shape(dx.shape(), (3, 2))
def test_MeanSquareError_cpu(self):
self._MeanSquareError_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_MeanSquareError_gpu(self):
self._MeanSquareError_helper(gpu_dev)
def _Abs_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.array([0.8, 1.2, 3.3, 3.6, 0.5,
0.5]).reshape(3, 2).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
result = autograd.abs(x)
dx = result.creator.backward(x.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result), XT)
self.check_shape(dx.shape(), (3, 2))
def test_Abs_cpu(self):
self._Abs_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Abs_gpu(self):
self._Abs_helper(gpu_dev)
def _Mean_helper(self, dev):
x0 = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
x1 = np.array([0, -0.3, 0, 0.1, 0, 0.9]).reshape(3,
2).astype(np.float32)
y = (x0 + x1) / 2
grad = np.ones(x0.shape) / 2
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd.mean(x0, x1)
dy = tensor.from_numpy(np.ones((3, 2)).astype(np.float32))
dy.to_device(dev)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad,
decimal=5)
def test_Mean_cpu(self):
self._Mean_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Mean_gpu(self):
self._Mean_helper(gpu_dev)
def _Exp_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.exp(X)
x = tensor.from_numpy(X)
x.to_device(dev)
result = autograd.exp(x)
dx = result.creator.backward(x.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
self.check_shape(dx.shape(), (3, 2))
def test_Exp_cpu(self):
self._Exp_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Exp_gpu(self):
self._Exp_helper(gpu_dev)
def _Identity_helper(self, dev):
x = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
y = x.copy()
grad = np.ones(x.shape)
x = tensor.from_numpy(x)
x.to_device(dev)
result = autograd.identity(x)
dy = tensor.from_numpy(np.ones((3, 2)).astype(np.float32))
dy.to_device(dev)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
self.check_shape(dx.shape(), (3, 2))
def test_Identity_cpu(self):
self._Identity_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Identity_gpu(self):
self._Identity_helper(gpu_dev)
def _LeakyRelu_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.array([0.8, -0.012, 3.3, -0.036, -0.005,
0.5]).reshape(3, 2).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
result = autograd.leakyrelu(x)
dx = result.creator.backward(x.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result), XT)
self.check_shape(dx.shape(), (3, 2))
def test_LeakyRelu_cpu(self):
self._LeakyRelu_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_LeakyRelu_gpu(self):
self._LeakyRelu_helper(gpu_dev)
def _Relu_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.maximum(X, 0)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.relu(x)
dx = result.creator.backward(dy.data)
G = (X > 0).astype(np.float32)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Relu_cpu(self):
self._Relu_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Relu_gpu(self):
self._Relu_helper(gpu_dev)
def _Cos_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.cos(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.cos(x)
dx = result.creator.backward(dy.data)
G = -np.sin(X)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Cos_cpu(self):
self._Cos_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Cos_gpu(self):
self._Cos_helper(gpu_dev)
def _Cosh_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.cosh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.cosh(x)
dx = result.creator.backward(dy.data)
G = np.sinh(X)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Cosh_cpu(self):
self._Cosh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Cosh_gpu(self):
self._Cosh_helper(gpu_dev)
def _Acos_helper(self, dev):
X = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
XT = np.arccos(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.acos(x)
dx = result.creator.backward(dy.data)
G = -1.0 / np.sqrt(1.0 - np.square(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Acos_cpu(self):
self._Acos_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Acos_gpu(self):
self._Acos_helper(gpu_dev)
def _Acosh_helper(self, dev):
X = np.array([1.1, 1.5, 1.9, 2.2, 2.5,
2.8]).reshape(3, 2).astype(np.float32)
XT = np.arccosh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.acosh(x)
dx = result.creator.backward(dy.data)
G = 1.0 / np.multiply(np.sqrt(X - 1.0), np.sqrt(X + 1.0))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Acosh_cpu(self):
self._Acosh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Acosh_gpu(self):
self._Acosh_helper(gpu_dev)
def _Sin_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.sin(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.sin(x)
dx = result.creator.backward(dy.data)
G = np.cos(X)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Sin_cpu(self):
self._Sin_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Sin_gpu(self):
self._Sin_helper(gpu_dev)
def _Sinh_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.sinh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.sinh(x)
dx = result.creator.backward(dy.data)
G = np.cosh(X)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Sinh_cpu(self):
self._Sinh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Sinh_gpu(self):
self._Sinh_helper(gpu_dev)
def _Asin_helper(self, dev):
X = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
XT = np.arcsin(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.asin(x)
dx = result.creator.backward(dy.data)
G = 1.0 / np.sqrt(1.0 - np.square(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Asin_cpu(self):
self._Asin_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Asin_gpu(self):
self._Asin_helper(gpu_dev)
def _Asinh_helper(self, dev):
X = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
XT = np.arcsinh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.asinh(x)
dx = result.creator.backward(dy.data)
G = 1.0 / np.sqrt(np.square(X) + 1.0)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Asinh_cpu(self):
self._Asinh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Asinh_gpu(self):
self._Asinh_helper(gpu_dev)
def _Tan_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.tan(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.tan(x)
dx = result.creator.backward(dy.data)
G = 1.0 / np.square(np.cos(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Tan_cpu(self):
self._Tan_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Tan_gpu(self):
self._Tan_helper(gpu_dev)
def _Tanh_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.tanh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.tanh(x)
dx = result.creator.backward(dy.data)
G = 1.0 / np.square(np.cosh(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Tanh_cpu(self):
self._Tanh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Tanh_gpu(self):
self._Tanh_helper(gpu_dev)
def _Atan_helper(self, dev):
X = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
XT = np.arctan(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.atan(x)
dx = result.creator.backward(dy.data)
G = 1.0 / (1.0 + np.square(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Atan_cpu(self):
self._Atan_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Atan_gpu(self):
self._Atan_helper(gpu_dev)
def _Atanh_helper(self, dev):
X = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
XT = np.arctanh(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.atanh(x)
dx = result.creator.backward(dy.data)
G = 1.0 / (1.0 - np.square(X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Atanh_cpu(self):
self._Atanh_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Atanh_gpu(self):
self._Atanh_helper(gpu_dev)
def _Less_helper(self, dev):
x0 = np.array([-0.9, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
x1 = np.array([0, -0.3, 0, 0.1, 0, 0.9]).reshape(3,
2).astype(np.float32)
y = np.less(x0, x1)
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd.less(x0, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_Less_cpu(self):
self._Less_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Less_gpu(self):
self._Less_helper(gpu_dev)
def _Sub_helper(self, dev):
X0 = np.array([7, -5, 0.2, -0.1, 0.3, 4]).reshape(3,
2).astype(np.float32)
X1 = np.array([0.6, -1.3, 0.1, -0.1, 0.4,
0.3]).reshape(3, 2).astype(np.float32)
XT = np.subtract(X0, X1)
DY = np.ones((3, 2), dtype=np.float32)
x0 = tensor.from_numpy(X0)
x1 = tensor.from_numpy(X1)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.sub(x0, x1)
dx0, dx1 = result.creator.backward(dy.data)
DX0 = np.multiply(DY, 1.0)
DX1 = np.multiply(DY, -1.0)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
def test_Sub_cpu(self):
self._Sub_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Sub_gpu(self):
self._Sub_helper(gpu_dev)
def _Pow_helper(self, dev):
X0 = np.array([7, 5, 0.2, 0.1, 0.3, 4]).reshape(3, 2).astype(np.float32)
X1 = np.array([-1.0, 2.0, -1.0, -2.1, 1.0,
-2.0]).reshape(3, 2).astype(np.float32)
XT = np.power(X0, X1)
DY = np.ones((3, 2), dtype=np.float32)
x0 = tensor.from_numpy(X0)
x1 = tensor.from_numpy(X1)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.pow(x0, x1)
dx0, dx1 = result.creator.backward(dy.data)
G0 = np.multiply(X1, np.power(X0, (X1 - 1.0)))
DX0 = np.multiply(G0, DY)
G1 = np.multiply(np.power(X0, X1), np.log(X0))
DX1 = np.multiply(G1, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=4)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=4)
def test_Pow_cpu(self):
self._Pow_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Pow_gpu(self):
self._Pow_helper(gpu_dev)
def _SoftSign_helper(self, dev):
# y = x / (1 + np.abs(x))
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = X / (1 + np.absolute(X))
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.softsign(x)
dx = result.creator.backward(dy.data)
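# d softsign / d x = 1 / (1 + |x|)^2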
G = 1.0 / np.square(np.absolute(X) + 1.0)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_SoftSign_cpu(self):
self._SoftSign_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_SoftSign_gpu(self):
self._SoftSign_helper(gpu_dev)
def _SoftPlus_helper(self, dev):
#y = np.log(np.exp(x) + 1)
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.log(np.exp(X) + 1)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.softplus(x)
dx = result.creator.backward(dy.data)
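# d softplus / d x = 1 / (1 + exp(-x)), i.e. the sigmoid of x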
G = 1.0 / (1.0 + np.exp(-X))
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_SoftPlus_cpu(self):
self._SoftPlus_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_SoftPlus_gpu(self):
self._SoftPlus_helper(gpu_dev)
def _unsqueeze_helper(self, dev):
data = [0.1, -1.0, 0.4, 4.0, -0.9, 9.0]
x = np.array(data).reshape(1, 2, 3).astype(np.float32)
y = x.reshape(1, 1, 2, 3, 1)
dy = np.ones((1, 1, 2, 3, 1), dtype=np.float32)
grad = dy.reshape(1, 2, 3)
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.unsqueeze(x, [0, 4])
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_unsqueeze_cpu(self):
self._unsqueeze_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_unsqueeze_gpu(self):
self._unsqueeze_helper(gpu_dev)
def _Sqrt_helper(self, dev):
X = np.array([0.1, 1.0, 0.4, 4.0, 0.9,
9.0]).reshape(3, 2).astype(np.float32)
XT = np.sqrt(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.sqrt(x)
dx = result.creator.backward(dy.data)
G = 0.5 * np.power(X, -0.5)
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Sqrt_cpu(self):
self._Sqrt_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Sqrt_gpu(self):
self._Sqrt_helper(gpu_dev)
def _transpose_helper(self, dev):
x = np.random.randn(3, 2, 1)
y = x.transpose(1, 2, 0)
dy = np.random.randn(*(y.shape))
grad = dy.transpose((2, 0, 1))
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.transpose(x, (1, 2, 0))
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_transpose_cpu(self):
self._transpose_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_transpose_gpu(self):
self._transpose_helper(gpu_dev)
def _Sign_helper(self, dev):
X = np.array([0.8, -1.2, 3.3, -3.6, -0.5,
0.5]).reshape(3, 2).astype(np.float32)
XT = np.sign(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.sign(x)
dx = result.creator.backward(dy.data)
DX = np.multiply(DY, 0)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Sign_cpu(self):
self._Sign_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Sign_gpu(self):
self._Sign_helper(gpu_dev)
def _Log_helper(self, dev):
X = np.array([0.1, 1.0, 0.4, 1.4, 0.9,
2.0]).reshape(3, 2).astype(np.float32)
XT = np.log(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.log(x)
dx = result.creator.backward(dy.data)
#dx = 1/x
G = 1.0 / X
DX = np.multiply(G, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_Log_cpu(self):
self._Log_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Log_gpu(self):
self._Log_helper(gpu_dev)
def _mul_helper(self, dev):
x = np.array([0.1, -1.0, 0.4, 4.0, -0.9,
9.0]).reshape(3, 2).astype(np.float32)
x1 = np.array([0.1, 1.0, 0.4, 4.0, 0.9,
9.0]).reshape(3, 2).astype(np.float32)
y = x * x1
dy = np.array([0.1, 1.0, 0.4, 4.0, 0.9,
9.0]).reshape(3, 2).astype(np.float32)
grad0 = x1 * dy
grad1 = x * dy
x = tensor.from_numpy(x)
slope = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
slope.to_device(dev)
dy.to_device(dev)
result = autograd.mul(x, slope)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_mul_cpu(self):
self._mul_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_mul_gpu(self):
self._mul_helper(gpu_dev)
def _reshape_helper(self, dev):
x = np.array([0.1, -1.0, 0.4, 4.0, -0.9,
9.0]).reshape(3, 2).astype(np.float32)
y = x.reshape(2, 3)
dy = np.array([1, 2, 3, 4, 5, 6]).reshape(2, 3).astype(np.float32)
grad = dy.reshape(3, 2)
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.reshape(x, (2, 3))
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_reshape_cpu(self):
self._reshape_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_reshape_gpu(self):
self._reshape_helper(gpu_dev)
def _max_helper(self, dev):
X0 = np.array([0.1, 0.2, 2.0, 0.0, 0.1,
0.2]).reshape(3, 2).astype(np.float32)
X1 = np.array([1.0, 2.0, 1.0, 2.1, 0.0,
2.0]).reshape(3, 2).astype(np.float32)
XT = np.maximum(X0, X1)
DY = np.ones((3, 2), dtype=np.float32)
x0 = tensor.from_numpy(X0)
x1 = tensor.from_numpy(X1)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.max(x0, x1)
dx0, dx1 = result.creator.backward(dy.data)
G = np.subtract(X0, X1)
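# reference subgradient of elementwise max: the all-ones dy flows to the
# strictly larger input (np.where puts 0 at ties)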
DX0 = np.where(G > 0, 1, G * 0)
DX1 = np.where(G < 0, 1, G * 0)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
def test_max_cpu(self):
self._max_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_max_gpu(self):
self._max_helper(gpu_dev)
def _max_3inputs_helper(self, dev):
data_0 = np.array([3, 2, 1]).astype(np.float32)
data_1 = np.array([1, 4, 4]).astype(np.float32)
data_2 = np.array([2, 5, 3]).astype(np.float32)
XT = np.array([3, 5, 4]).astype(np.float32)
DY = np.array([1, 1, 1]).astype(np.float32)
x0 = tensor.from_numpy(data_0)
x1 = tensor.from_numpy(data_1)
x2 = tensor.from_numpy(data_2)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
x2.to_device(dev)
dy.to_device(dev)
result = autograd.max(x0, x1, x2)
dx0, dx1, dx2 = result.creator.backward(dy.data)
DX0 = np.array([1, 0, 0]).astype(np.float32)
DX1 = np.array([0, 0, 1]).astype(np.float32)
DX2 = np.array([0, 1, 0]).astype(np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx2)),
DX2,
decimal=5)
def test_max_3inputs_cpu(self):
self._max_3inputs_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_max_3inputs_gpu(self):
self._max_3inputs_helper(gpu_dev)
def _max_1inputs_helper(self, dev):
data_0 = np.array([3, 2, 1]).astype(np.float32)
XT = np.array([3, 2, 1]).astype(np.float32)
DY = np.array([1, 1, 1]).astype(np.float32)
x0 = tensor.from_numpy(data_0)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
dy.to_device(dev)
result = autograd.max(x0)
dx0 = result.creator.backward(dy.data)
DX0 = np.array([1, 1, 1]).astype(np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
def test_max_1inputs_cpu(self):
self._max_1inputs_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_max_1inputs_gpu(self):
self._max_1inputs_helper(gpu_dev)
def _Div_helper(self, dev):
X0 = np.array([7, -5, 0.2, -0.1, 0.3, 4]).reshape(3,
2).astype(np.float32)
X1 = np.array([0.6, -1.3, 0.1, -0.1, 0.4,
0.3]).reshape(3, 2).astype(np.float32)
XT = np.divide(X0, X1)
DY = np.ones((3, 2), dtype=np.float32)
x0 = tensor.from_numpy(X0)
x1 = tensor.from_numpy(X1)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.div(x0, x1)
dx0, dx1 = result.creator.backward(dy.data)
G0 = 1.0 / X1
DX0 = np.multiply(G0, DY)
G1 = np.divide(-X0, np.square(X1))
DX1 = np.multiply(G1, DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
def test_Div_cpu(self):
self._Div_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_Div_gpu(self):
self._Div_helper(gpu_dev)
def _squeeze_helper(self, dev):
x = np.random.randn(3, 1, 2, 1, 1)
y = x.reshape(3, 2)
dy = np.random.randn(3, 2)
grad = dy.reshape(3, 1, 2, 1, 1)
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.squeeze(x, [1, 3, 4])
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_squeeze_cpu(self):
self._squeeze_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_squeeze_gpu(self):
self._squeeze_helper(gpu_dev)
def _shape_helper(self, dev):
x = np.array([0.1, -1.0, 0.4, 4.0, -0.9,
9.0]).reshape(3, 2).astype(np.float32)
y = list(x.shape)
dy = np.ones((3, 2), dtype=np.float32)
grad = list(dy.shape)
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.shape(x)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(dx, grad, decimal=5)
def test_shape_cpu(self):
self._shape_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_shape_gpu(self):
self._shape_helper(gpu_dev)
def _min_helper(self, dev):
X0 = np.array([0.1, 0.2, 2.0, 0.0, 0.1,
0.2]).reshape(3, 2).astype(np.float32)
X1 = np.array([1.0, 2.0, 1.0, 2.1, 0.0,
2.0]).reshape(3, 2).astype(np.float32)
XT = np.minimum(X0, X1)
DY = np.ones((3, 2), dtype=np.float32)
x0 = tensor.from_numpy(X0)
x1 = tensor.from_numpy(X1)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.min(x0, x1)
dx0, dx1 = result.creator.backward(dy.data)
G = np.subtract(X0, X1)
DX0 = np.where(G < 0, 1, G * 0)
DX1 = np.where(G > 0, 1, G * 0)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
def test_min_cpu(self):
self._min_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_min_gpu(self):
self._min_helper(gpu_dev)
def _min_3inputs_helper(self, dev):
data_0 = np.array([3, 2, 1]).astype(np.float32)
data_1 = np.array([1, 4, 4]).astype(np.float32)
data_2 = np.array([2, 5, 0]).astype(np.float32)
XT = np.array([1, 2, 0]).astype(np.float32)
DY = np.array([1, 1, 1]).astype(np.float32)
x0 = tensor.from_numpy(data_0)
x1 = tensor.from_numpy(data_1)
x2 = tensor.from_numpy(data_2)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
x1.to_device(dev)
x2.to_device(dev)
dy.to_device(dev)
result = autograd.min(x0, x1, x2)
dx0, dx1, dx2 = result.creator.backward(dy.data)
DX0 = np.array([0, 1, 0]).astype(np.float32)
DX1 = np.array([1, 0, 0]).astype(np.float32)
DX2 = np.array([0, 0, 1]).astype(np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
DX1,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx2)),
DX2,
decimal=5)
def test_min_3inputs_cpu(self):
self._min_3inputs_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_min_3inputs_gpu(self):
self._min_3inputs_helper(gpu_dev)
def _min_1inputs_helper(self, dev):
data_0 = np.array([3, 2, 1]).astype(np.float32)
XT = np.array([3, 2, 1]).astype(np.float32)
DY = np.array([1, 1, 1]).astype(np.float32)
x0 = tensor.from_numpy(data_0)
dy = tensor.from_numpy(DY)
x0.to_device(dev)
dy.to_device(dev)
result = autograd.min(x0)
dx0 = result.creator.backward(dy.data)
DX0 = np.array([1, 1, 1]).astype(np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
DX0,
decimal=5)
def test_min_1inputs_cpu(self):
self._min_1inputs_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_min_1inputs_gpu(self):
self._min_1inputs_helper(gpu_dev)
def _HardSigmoid_helper(self, dev):
x = np.random.randn(3, 2)
#y = max(0, min(1, alpha * x + gamma))
a = 0.2
g = 0.5
y = np.clip(x * 0.2 + 0.5, 0, 1)
dy = np.random.randn(3, 2)
grad = (0 < (np.clip(x * 0.2 + 0.5, 0, 1)) *
(np.clip(x * 0.2 + 0.5, 0, 1) < 1)) * 0.2 * dy
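# d hardsigmoid / d x = alpha (0.2 here) wherever 0 < alpha * x + gamma < 1, else 0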
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.hardsigmoid(x, a, g)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_HardSigmoid_cpu(self):
self._HardSigmoid_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_HardSigmoid_gpu(self):
self._HardSigmoid_helper(gpu_dev)
def _prelu_helper(self, dev):
x = np.random.randn(3, 2)
slope = np.random.randn(3, 2)
y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * slope
dy = np.random.randn(3, 2)
x0 = x.copy()
x0[x0 > 0] = 1
x0[x0 < 1] = 0
grad0 = (x0 + (1 - x0) * slope) * dy
grad1 = (1 - x0) * x * dy
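# x0 is the indicator of x > 0: d y / d x = 1 for x > 0 and slope otherwise,
# while d y / d slope = x for x <= 0 and 0 otherwise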
x = tensor.from_numpy(x)
slope = tensor.from_numpy(slope)
dy = tensor.from_numpy(dy)
x.to_device(dev)
slope.to_device(dev)
dy.to_device(dev)
result = autograd.prelu(x, slope)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_prelu_cpu(self):
self._prelu_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_prelu_gpu(self):
self._prelu_helper(gpu_dev)
def _SeLU_helper(self, dev):
x = np.random.randn(3, 2)
a = 0.2
g = 0.3
y = np.clip(x, 0,
np.inf) * g + (np.exp(np.clip(x, -np.inf, 0)) - 1) * a * g
dy = np.random.randn(3, 2)
grad = (np.exp(np.clip(x, -np.inf, 0))) * g
grad[x <= 0] = grad[x <= 0] * a
grad *= dy
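# d selu / d x = g for x > 0 and a * g * exp(x) for x <= 0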
x = tensor.from_numpy(x)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.selu(x, a, g)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
grad,
decimal=5)
def test_SeLU_cpu(self):
self._SeLU_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_SeLU_gpu(self):
self._SeLU_helper(gpu_dev)
def _and_helper(self, dev):
x0 = np.array([0, -0.3, -0.1, 0.1, 0.5,
0.9]).reshape(3, 2).astype(np.float32)
x1 = np.array([0, -0.3, 0, 0.1, 0.5, 0.9]).reshape(3,
2).astype(np.float32)
y = np.logical_and(x0, x1)
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd._and(x0, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_and_cpu(self):
self._and_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_and_gpu(self):
self._and_helper(gpu_dev)
def _or_helper(self, dev):
x0 = np.array([1.0, 1.0, 2.0, -3.0, 0,
-7.0]).reshape(3, 2).astype(np.float32)
x1 = np.array([-1.0, 0, 2.0, 4.0, 0,
-7.0]).reshape(3, 2).astype(np.float32)
y = np.logical_or(x0, x1)
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd._or(x0, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_or_cpu(self):
self._or_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_or_gpu(self):
self._or_helper(gpu_dev)
def _not_helper(self, dev):
x = np.array([1.0, -1.0, 0, -0.1, 0,
-7.0]).reshape(3, 2).astype(np.float32)
y = np.logical_not(x)
x = tensor.from_numpy(x)
x.to_device(dev)
result = autograd._not(x)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_not_cpu(self):
self._not_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_not_gpu(self):
self._not_helper(gpu_dev)
def _xor_helper(self, dev):
x0 = np.array([0, -0.3, -0.1, 0.1, 0.5,
9.0]).reshape(3, 2).astype(np.float32)
x1 = np.array([0, -0.3, 0, 0.1, 0, 0.9]).reshape(3,
2).astype(np.float32)
y = np.logical_xor(x0, x1)
x0 = tensor.from_numpy(x0)
x1 = tensor.from_numpy(x1)
x0.to_device(dev)
x1.to_device(dev)
result = autograd._xor(x0, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_xor_cpu(self):
self._xor_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_xor_gpu(self):
self._xor_helper(gpu_dev)
def _negative_helper(self, dev):
X = np.array([0.1, 0, 0.4, 1. - 4, 0.9,
-2.0]).reshape(3, 2).astype(np.float32)
XT = np.negative(X)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.negative(x)
dx = result.creator.backward(dy.data)
DX = np.negative(DY)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_negative_cpu(self):
self._negative_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_negative_gpu(self):
self._negative_helper(gpu_dev)
def _reciprocal_helper(self, dev):
X = np.array([0.1, 0, 0.4, 1. - 4, 0.9,
-2.0]).reshape(3, 2).astype(np.float32)
DY = np.ones((3, 2), dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.reciprocal(x)
dx = result.creator.backward(dy.data)
#dy/dx = -1/x**2
with np.errstate(divide='ignore'):
XT = np.reciprocal(X)
DX = -1 / np.square(X)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_reciprocal_cpu(self):
self._reciprocal_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_reciprocal_gpu(self):
self._reciprocal_helper(gpu_dev)
def _and_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = (np.random.randn(*in1) > 0).astype(np.float32)
x1 = (np.random.randn(*in2) > 0).astype(np.float32)
y = np.logical_and(x, x1)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
x.to_device(dev)
x1.to_device(dev)
result = autograd._and(x, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_and_broadcast_cpu(self):
self._and_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_and_broadcast_gpu(self):
self._and_broadcast_helper(gpu_dev)
def _or_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = (np.random.randn(*in1) > 0).astype(np.float32)
x1 = (np.random.randn(*in2) > 0).astype(np.float32)
y = np.logical_or(x, x1)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
x.to_device(dev)
x1.to_device(dev)
result = autograd._or(x, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_or_broadcast_cpu(self):
self._or_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_or_broadcast_gpu(self):
self._or_broadcast_helper(gpu_dev)
def _xor_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = (np.random.randn(*in1) > 0).astype(np.float32)
x1 = (np.random.randn(*in2) > 0).astype(np.float32)
y = np.logical_xor(x, x1)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
x.to_device(dev)
x1.to_device(dev)
result = autograd._xor(x, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_xor_broadcast_cpu(self):
self._xor_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_xor_broadcast_gpu(self):
self._xor_broadcast_helper(gpu_dev)
def _greater_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32)
y = np.greater(x, x1)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
x.to_device(dev)
x1.to_device(dev)
result = autograd.greater(x, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_greater_broadcast_cpu(self):
self._greater_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_greater_broadcast_gpu(self):
self._greater_broadcast_helper(gpu_dev)
def _less_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32)
y = np.less(x, x1)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
x.to_device(dev)
x1.to_device(dev)
result = autograd.less(x, x1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_less_broadcast_cpu(self):
self._less_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_less_broadcast_gpu(self):
self._less_broadcast_helper(gpu_dev)
def _add_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32)
y = x + x1
dy = np.random.randn(*y.shape)
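# broadcast add: each input's gradient is dy summed over the axes that were
# broadcast (see axis_helper) and reshaped back to that input's shape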
grad0 = np.sum(dy, axis=axis_helper(y.shape,
x.shape)).reshape(x.shape)
grad1 = np.sum(dy, axis=axis_helper(y.shape,
x1.shape)).reshape(x1.shape)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.add(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_add_broadcast_cpu(self):
self._add_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_add_broadcast_gpu(self):
self._add_broadcast_helper(gpu_dev)
def _sub_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32)
y = x - x1
dy = np.random.randn(*y.shape)
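# broadcast sub: d(x - x1)/dx = 1 and d(x - x1)/dx1 = -1, so the expected
# gradients are dy (resp. -dy) summed over the broadcast axes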
grad0 = np.sum(dy, axis=axis_helper(y.shape,
x.shape)).reshape(x.shape)
grad1 = np.sum(-dy, axis=axis_helper(y.shape,
x1.shape)).reshape(x1.shape)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.sub(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_sub_broadcast_cpu(self):
self._sub_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_sub_broadcast_gpu(self):
self._sub_broadcast_helper(gpu_dev)
def _mul_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32)
y = x * x1
dy = np.random.randn(*y.shape)
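# broadcast mul: d(x * x1)/dx = x1 and d(x * x1)/dx1 = x, so the expected
# gradients are x1 * dy and x * dy summed over the broadcast axes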
grad0 = np.sum(x1 * dy, axis=axis_helper(y.shape,
x.shape)).reshape(x.shape)
grad1 = np.sum(x * dy, axis=axis_helper(y.shape,
x1.shape)).reshape(x1.shape)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.mul(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_mul_broadcast_cpu(self):
self._mul_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_mul_broadcast_gpu(self):
self._mul_broadcast_helper(gpu_dev)
def _div_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
x1 = np.random.randn(*in2).astype(np.float32) + 1.0
y = x / x1
dy = np.random.randn(*y.shape).astype(np.float32)
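# broadcast div: d(x / x1)/dx = 1 / x1 and d(x / x1)/dx1 = -x / x1^2,
# each summed over the broadcast axes and reshaped back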
grad0 = np.sum(np.power(x1, -1) * dy,
axis=axis_helper(y.shape, x.shape)).reshape(x.shape)
grad1 = np.sum(x * -np.power(x1, -2) * dy,
axis=axis_helper(y.shape,
x1.shape)).reshape(x1.shape)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.div(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
# use relative and absolute tolerances instead of a fixed decimal count
np.testing.assert_allclose(tensor.to_numpy(result),
y,
rtol=1e-4,
atol=1e-4)
np.testing.assert_allclose(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
rtol=1e-4,
atol=1e-4)
np.testing.assert_allclose(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
rtol=1e-4,
atol=1e-4)
def test_div_broadcast_cpu(self):
self._div_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_div_broadcast_gpu(self):
self._div_broadcast_helper(gpu_dev)
def _pow_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randint(1, 10, size=in1).astype(np.float32)
x1 = np.random.randint(1, 5, size=in2).astype(np.float32)
y = np.power(x, x1).astype(np.float32)
dy = np.random.randn(*y.shape).astype(np.float32)
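# broadcast pow: d(x^x1)/dx = x1 * x^(x1 - 1) and d(x^x1)/dx1 = x^x1 * ln(x);
# the bases are positive integers above so ln(x) is well defined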
grad0 = np.sum(x1 * np.power(x, x1 - 1) * dy,
axis=axis_helper(y.shape, x.shape)).reshape(x.shape)
grad1 = np.sum(np.power(x, x1) * np.log(x) * dy,
axis=axis_helper(y.shape,
x1.shape)).reshape(x1.shape)
x = tensor.from_numpy(x)
x1 = tensor.from_numpy(x1)
dy = tensor.from_numpy(dy)
x.to_device(dev)
x1.to_device(dev)
dy.to_device(dev)
result = autograd.pow(x, x1)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=2)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=2)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=2)
def test_pow_broadcast_cpu(self):
self._pow_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_pow_broadcast_gpu(self):
self._pow_broadcast_helper(gpu_dev)
def _prelu_broadcast_helper(self, dev):
cases = [
([3, 4, 5], [5]), # 3d vs 1d
([3, 4, 5], [4, 5]), # 3d vs 2d
([3, 4, 5, 6], [5, 6]), # 4d vs 2d
([3, 4, 5, 6], [4, 5, 6]), # 4d vs 3d
([1, 4, 1, 6], [3, 1, 5, 6]) # 4d vs 4d
]
for in1, in2 in cases:
x = np.random.randn(*in1).astype(np.float32)
slope = np.random.randn(*in2).astype(np.float32)
y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * slope
dy = np.random.randn(*y.shape).astype(np.float32)
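# build a 0/1 mask (x0 = 1 where x > 0, else 0); prelu gradients are
# dy/dx = 1 for x > 0 else slope, and dy/dslope = x for x <= 0 else 0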
x0 = x.copy()
x0[x0 > 0] = 1
x0[x0 < 1] = 0
grad0 = np.sum((x0 + (1 - x0) * slope) * dy,
axis=axis_helper(y.shape, x.shape)).reshape(x.shape)
grad1 = np.sum((1 - x0) * x * dy,
axis=axis_helper(y.shape,
slope.shape)).reshape(slope.shape)
x = tensor.from_numpy(x)
slope = tensor.from_numpy(slope)
dy = tensor.from_numpy(dy)
x.to_device(dev)
slope.to_device(dev)
dy.to_device(dev)
result = autograd.prelu(x, slope)
dx0, dx1 = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx0)),
grad0,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx1)),
grad1,
decimal=5)
def test_prelu_broadcast_cpu(self):
self._prelu_broadcast_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_prelu_broadcast_gpu(self):
self._prelu_broadcast_helper(gpu_dev)
def _gemm_helper(self, dev):
configs = [
# alpha, beta, transA, transB, shapeA, shapeB, shapeC, shapeY
[0.25, 0.35, 0, 0, (3, 4), (4, 5), (1, 5), (3, 5)],
[0.25, 0.35, 0, 1, (3, 4), (5, 4), (1, 5), (3, 5)],
[0.25, 0.35, 1, 0, (4, 3), (4, 5), (1, 5), (3, 5)],
[0.25, 0.35, 1, 1, (4, 3), (5, 4), (1, 5), (3, 5)],
]
for config in configs:
alpha = config[0]
beta = config[1]
transA = config[2]
transB = config[3]
shapeA = config[4]
shapeB = config[5]
shapeC = config[6]
shapeY = config[7]
A = np.random.randn(*shapeA).astype(np.float32)
DY = np.ones(shapeY, dtype=np.float32)
if transB == 0:
out_features = shapeB[1]
else:
out_features = shapeB[0]
a = tensor.from_numpy(A)
a.to_device(dev)
dy = tensor.from_numpy(DY)
dy.to_device(dev)
gemm = layer.Gemm(out_features, alpha, beta, transA == 1,
transB == 1)
result = gemm(a)
params = gemm.get_params()
B = tensor.to_numpy(params['W'])
C = tensor.to_numpy(params['b'])
da, db, dc = result.creator.backward(dy.data)
# Y = alpha * A' * B' + beta * C
_A = A if transA == 0 else A.T
_B = B if transB == 0 else B.T
C = C if C is not None else np.array(0)
Y = alpha * np.dot(_A, _B) + beta * C
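# backward of Y = alpha * A' @ B' + beta * C:
# dA' = alpha * DY @ B'^T and dB' = alpha * A'^T @ DY (transposed back to
# A/B when transA/transB is set); dC = beta * DY summed over the axes
# broadcast from C's shape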
DA = alpha * np.matmul(DY, _B.T)
DA = DA if transA == 0 else DA.T
DB = alpha * np.matmul(_A.T, DY)
DB = DB if transB == 0 else DB.T
DC = beta * np.sum(DY, axis=axis_helper(Y.shape, C.shape)).reshape(
C.shape)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
Y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(da)),
DA,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(db)),
DB,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dc)),
DC,
decimal=5)
def test_gemm_cpu(self):
self._gemm_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gemm_gpu(self):
self._gemm_helper(gpu_dev)
def globalaveragepool_channel_first(self, dev):
X = np.array([[[
[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
]]]).astype(np.float32)
XT = np.array([[[[5]]]]).astype(np.float32)
DY = np.ones((1, 1, 1, 1), dtype=np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
dy = tensor.from_numpy(DY)
dy.to_device(dev)
result = autograd.globalaveragepool(x)
dx = result.creator.backward(dy.data)
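# the gradient spreads dy uniformly over the pooled spatial positions,
# i.e. DX = DY / (H * W)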
DX = np.ones(X.shape, dtype=np.float32)
DX = np.multiply(DX, DY) / np.prod(X.shape[2:])
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def globalaveragepool_channel_last(self, dev):
X = np.array([[
[[1], [2], [3]],
[[4], [5], [6]],
[[7], [8], [9]],
]]).astype(np.float32)
XT = np.array([[[[5]]]]).astype(np.float32)
DY = np.ones((1, 1, 1, 1), dtype=np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
dy = tensor.from_numpy(DY)
dy.to_device(dev)
result = autograd.globalaveragepool(x, 'channel_last')
dx = result.creator.backward(dy.data)
DX = np.ones(X.shape, dtype=np.float32)
DX = np.multiply(DX, DY) / np.prod(X.shape[1:-1])
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
XT,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_globalaveragepool_cpu(self):
self.globalaveragepool_channel_first(cpu_dev)
self.globalaveragepool_channel_last(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_globalaveragepool_gpu(self):
self.globalaveragepool_channel_first(gpu_dev)
self.globalaveragepool_channel_last(gpu_dev)
def constantOfShape_test(self, dev):
# float_ones
X = np.array([4, 3, 2]).astype(np.int64)
x = tensor.from_numpy(X)
x.to_device(dev)
y = np.ones(X, dtype=np.float32)
result = autograd.constant_of_shape(x, 1.0)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
# int32_ones
X = np.array([10, 6]).astype(np.int64)
x = tensor.from_numpy(X)
x.to_device(dev)
y = np.ones(X, dtype=np.int32)
result = autograd.constant_of_shape(x, 1)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_constantOfShape_cpu(self):
self.constantOfShape_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_constantOfShape_gpu(self):
self.constantOfShape_test(gpu_dev)
def dropout_test(self, dev):
X = np.random.randn(3, 4, 5).astype(np.float32)
dy = np.random.randn(3, 4, 5).astype(np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.dropout(x, 0.5)
dx = result.creator.backward(dy.data)
self.check_shape(result.shape, (3, 4, 5))
self.check_shape(dx.shape(), (3, 4, 5))
def test_dropout_cpu(self):
self.dropout_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_dropout_gpu(self):
self.dropout_test(gpu_dev)
def reduceSum_test(self, dev):
shape = [3, 2, 2]
cases = [(None, 1), ([1], 0), ([1], 1), ([-2], 1), ([1, 2], 1)]
for axes, keepdims in cases:
X = np.random.uniform(-10, 10, shape).astype(np.float32)
_axes = tuple(axes) if axes is not None else None
y = np.sum(X, axis=_axes, keepdims=keepdims == 1)
dy = np.random.randn(*y.shape).astype(np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.reduce_sum(x, axes, keepdims)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
self.check_shape(dx.shape(), tuple(shape))
def test_reduceSum_cpu(self):
self.reduceSum_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_reduceSum_gpu(self):
self.reduceSum_test(gpu_dev)
def reduceMean_test(self, dev):
shape = [3, 2, 2]
cases = [(None, 1), ([1], 0), ([1], 1), ([-2], 1), ([1, 2], 1)]
for axes, keepdims in cases:
X = np.random.uniform(-10, 10, shape).astype(np.float32)
_axes = tuple(axes) if axes is not None else None
y = np.mean(X, axis=_axes, keepdims=keepdims == 1)
dy = np.random.randn(*y.shape).astype(np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.reduce_mean(x, axes, keepdims)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
self.check_shape(dx.shape(), tuple(shape))
def test_reduceMean_cpu(self):
self.reduceMean_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_reduceMean_gpu(self):
self.reduceMean_test(gpu_dev)
def slice_test(self, dev):
X = np.random.randn(20, 10, 5).astype(np.float32)
indexes = np.array(range(20 * 10 * 5)).reshape(20, 10, 5)
configs = [
# starts, ends, axes, steps, y, dx_idx
[[0, 0], [3, 10], [0, 1], [1, 1], X[0:3, 0:10],
indexes[0:3, 0:10]], # slice
[[0, 0, 3], [20, 10, 4], None, None, X[:, :, 3:4],
indexes[:, :, 3:4]], # slice_default_axes
[[1], [1000], [1], [1], X[:, 1:1000],
indexes[:, 1:1000]], # slice_end_out_of_bounds
[[0], [-1], [1], [1], X[:, 0:-1],
indexes[:, 0:-1]], # slice_neg (negative end index)
[[20, 10, 4], [0, 0, 1], [0, 1, 2], [-1, -3, -2],
X[20:0:-1, 10:0:-3, 4:1:-2], indexes[20:0:-1, 10:0:-3,
4:1:-2]], # slice_neg_steps
[[0, 0, 3], [20, 10, 4], [0, -2, -1], None, X[:, :, 3:4],
indexes[:, :, 3:4]], # slice_negative_axes
# [[1000], [1000], [1], [1], X[:, 1000:1000], indexes[:, 1000:1000]], # slice_start_out_of_bounds # cannot support empty tensor
]
for starts, ends, axes, steps, y, dx_idx in configs:
dy = np.ones(y.shape).astype(np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(dy)
x.to_device(dev)
dy.to_device(dev)
result = autograd.slice(x, starts, ends, axes, steps)
dx = result.creator.backward(dy.data)
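# expected dX: 1 at every input position covered by the slice, 0 elsewhere
# (dy is all ones), built from the flattened indexes selected above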
dx_idx = tuple(dx_idx.flatten().tolist())
dX = np.array([
1. if i in dx_idx else 0. for i in range(20 * 10 * 5)
]).reshape(X.shape)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
dX,
decimal=5)
def test_slice_cpu(self):
self.slice_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_slice_gpu(self):
self.slice_test(gpu_dev)
def ceil_test(self, dev):
X = np.array([-1.5, 1.2]).astype(np.float32)
DY = np.ones((2), dtype=np.float32)
y = np.ceil(X)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.ceil(x)
dx = result.creator.backward(dy.data)
DX = np.zeros((2), dtype=np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_ceil_cpu(self):
self.ceil_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_ceil_gpu(self):
self.ceil_test(gpu_dev)
def floor_test(self, dev):
X = np.array([-1.9, 1.2]).astype(np.float32)
DY = np.ones((2), dtype=np.float32)
y = np.floor(X)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.floor(x)
dx = result.creator.backward(dy.data)
DX = np.zeros((2), dtype=np.float32)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_floor_cpu(self):
self.floor_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_floor_gpu(self):
self.floor_test(gpu_dev)
def _test_scatter_elements(self, dev):
# testing without axis
data = np.zeros((3, 3), dtype=np.float32)
indices = np.array([[1, 0, 2], [0, 2, 1]], dtype=np.int32)
updates = np.array([[1.0, 1.1, 1.2], [2.0, 2.1, 2.2]], dtype=np.float32)
output = np.array([[2.0, 1.1, 0.0], [1.0, 0.0, 2.2], [0.0, 2.1, 1.2]],
dtype=np.float32)
data = tensor.from_numpy(data)
indices = tensor.from_numpy(indices)
updates = tensor.from_numpy(updates)
data.to_device(dev)
indices.to_device(dev)
updates.to_device(dev)
result = autograd.scatter_elements(data, indices, updates)
dy = tensor.from_numpy(np.ones(data.shape, dtype=np.float32))
dx = result.creator.backward(dy.data)
np.testing.assert_almost_equal(tensor.to_numpy(result),
output,
decimal=5)
self.check_shape(dx.shape(), data.shape)
# testing with axis
data = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float32)
indices = np.array([[1, 3]], dtype=np.int32)
updates = np.array([[1.1, 2.1]], dtype=np.float32)
output = np.array([[1.0, 1.1, 3.0, 2.1, 5.0]], dtype=np.float32)
data = tensor.from_numpy(data)
indices = tensor.from_numpy(indices)
updates = tensor.from_numpy(updates)
data.to_device(dev)
indices.to_device(dev)
updates.to_device(dev)
result = autograd.scatter_elements(data, indices, updates, axis=1)
dy = tensor.from_numpy(np.ones(data.shape, dtype=np.float32))
dx = result.creator.backward(dy.data)
np.testing.assert_almost_equal(tensor.to_numpy(result),
output,
decimal=5)
self.check_shape(dx.shape(), data.shape)
# testing with negative indices:
data = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float32)
indices = np.array([[1, -3]], dtype=np.int64)
updates = np.array([[1.1, 2.1]], dtype=np.float32)
output = np.array([[1.0, 1.1, 2.1, 4.0, 5.0]], dtype=np.float32)
data = tensor.from_numpy(data)
indices = tensor.from_numpy(indices)
updates = tensor.from_numpy(updates)
data.to_device(dev)
indices.to_device(dev)
updates.to_device(dev)
result = autograd.scatter_elements(data, indices, updates, axis=1)
dy = tensor.from_numpy(np.ones(data.shape, dtype=np.float32))
dx = result.creator.backward(dy.data)
np.testing.assert_almost_equal(tensor.to_numpy(result),
output,
decimal=5)
self.check_shape(dx.shape(), data.shape)
def test_cpu_scatter_elements(self):
self._test_scatter_elements(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gpu_scatter_elements(self):
self._test_scatter_elements(gpu_dev)
def split_test(self, dev):
X = np.array([1., 2., 3., 4., 5., 6.]).astype(np.float32)
DY1 = np.ones((2), dtype=np.float32)
DY2 = np.ones((4), dtype=np.float32)
y = [
np.array([1., 2.]).astype(np.float32),
np.array([3., 4., 5., 6.]).astype(np.float32)
]
x = tensor.from_numpy(X)
dy1 = tensor.from_numpy(DY1)
dy2 = tensor.from_numpy(DY2)
x.to_device(dev)
dy1.to_device(dev)
dy2.to_device(dev)
result = autograd.split(x, 0, (2, 4))
dx = result[0].creator.backward(dy1.data, dy2.data)
DX = np.ones((6), dtype=np.float32)
for idx, _r in enumerate(result):
np.testing.assert_array_almost_equal(tensor.to_numpy(_r),
y[idx],
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_split_cpu(self):
self.split_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_split_gpu(self):
self.split_test(gpu_dev)
def gather_test(self, dev):
config = [([0, 1, 3], 0), ([0, 1, 3], 1), ([[0, 1], [1, 2], [2, 3]], 1),
([0, -1, -2], 0)] # (indices, axis)
for indices, _axis in config:
X = np.random.randn(5, 4, 3, 2).astype(np.float32)
y = np.take(X, indices, axis=_axis)
DY = np.ones(y.shape, dtype=np.float32)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.gather(x, _axis, indices)
dx = result.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
self.check_shape(dx.shape(), tuple(X.shape))
def test_gather_cpu(self):
self.gather_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_gather_gpu(self):
self.gather_test(gpu_dev)
def tile_test(self, dev):
config_repeats = [
2,
[2, 2],
[2, 1, 2],
]
for repeats in config_repeats:
X = np.array([0, 1, 2]).astype(np.float32)
y = np.tile(X, repeats)
DY = np.copy(y)
x = tensor.from_numpy(X)
dy = tensor.from_numpy(DY)
x.to_device(dev)
dy.to_device(dev)
result = autograd.tile(x, repeats)
dx = result.creator.backward(dy.data)
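# tile backward sums dy over all repeated copies; since dy is the tiled X,
# the expected gradient is X * prod(repeats)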
DX = np.multiply(X, np.prod(repeats))
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(
tensor.from_raw_tensor(dx)),
DX,
decimal=5)
def test_tile_cpu(self):
self.tile_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_tile_gpu(self):
self.tile_test(gpu_dev)
def noneZero_test(self, dev):
X = np.array([[1, 0], [1, 1]]).astype(np.float32)
y = np.array(np.nonzero(X))
x = tensor.from_numpy(X)
x.to_device(dev)
result = autograd.nonzero(x)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_noneZero_cpu(self):
self.noneZero_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_noneZero_gpu(self):
self.noneZero_test(gpu_dev)
def cast_test(self, dev):
config = [
(np.float32, np.int32, tensor.int32),
(np.int32, np.float32, tensor.float32),
]
for t1, t2, t3 in config:
X = np.array([[1, 0], [1, 1]]).astype(t1)
y = np.array([[1, 0], [1, 1]]).astype(t2)
x = tensor.from_numpy(X)
x.to_device(dev)
result = autograd.cast(x, t3)
result_np = tensor.to_numpy(result)
assert result_np.dtype == y.dtype, "type %s != %s." % (
result_np.dtype, y.dtype)
np.testing.assert_array_almost_equal(result_np, y, decimal=5)
def test_cast_cpu(self):
self.cast_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_cast_gpu(self):
self.cast_test(gpu_dev)
def onehot_test(self, dev):
def one_hot(indices, depth, axis=-1, dtype=np.float32): # type: ignore
''' Compute one hot from indices at a specific axis '''
values = np.asarray(indices)
rank = len(values.shape)
depth_range = np.arange(depth)
if axis < 0:
axis += (rank + 1)
ls = values.shape[0:axis]
rs = values.shape[axis:rank]
targets = np.reshape(depth_range, (1,) * len(ls) +
depth_range.shape + (1,) * len(rs))
values = np.reshape(np.mod(values, depth), ls + (1,) + rs)
return np.asarray(targets == values, dtype=dtype)
axisValue = 1
on_value = 3
off_value = 1
output_type = np.float32
indices = np.array([[1, 9], [2, 4]], dtype=np.float32)
depth = np.array([10], dtype=np.float32)
values = np.array([off_value, on_value], dtype=output_type)
y = one_hot(indices, depth, axis=axisValue, dtype=output_type)
y = y * (on_value - off_value) + off_value
x = tensor.from_numpy(indices)
x.to_device(dev)
result = autograd.onehot(axisValue, x, depth, values)
np.testing.assert_array_almost_equal(tensor.to_numpy(result),
y,
decimal=5)
def test_onehot_cpu(self):
self.onehot_test(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_onehot_gpu(self):
self.onehot_test(gpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_cudnn_rnn_operation(self, dev=gpu_dev):
# init params, inputs
hidden_size = 7
seq_length = 5
batch_size = 6
feature_size = 3
directions = 2
num_layers = 2
for mode in [0, 1, 2, 3]: # 0-relu, 1-tanh, 2-lstm, 3-gru
x = tensor.Tensor(shape=(seq_length, batch_size, feature_size),
device=dev).gaussian(0, 1)
hx = tensor.Tensor(shape=(num_layers * directions, batch_size,
hidden_size),
device=dev).gaussian(0, 1)
cx = tensor.Tensor(shape=(num_layers * directions, batch_size,
hidden_size),
device=dev).gaussian(0, 1)
dy = tensor.Tensor(shape=(seq_length, batch_size,
directions * hidden_size),
device=dev).gaussian(0, 1)
# init cudnn rnn op
rnn_handle = singa.CudnnRNNHandle(x.data,
hidden_size,
mode,
num_layers=num_layers,
dropout=0.1,
bidirectional=1)
w = tensor.Tensor(shape=(rnn_handle.weights_size,),
device=dev).gaussian(0, 1)
# return sequence, y shape = {seq, bs, hidden}
# init operator/operation
_rnn = autograd._RNN(rnn_handle, return_sequences=True)
# forward
y = _rnn(x, hx, cx, w)[0]
assert y.shape == dy.shape
# print(ys)
# backward
dx, dhx, dcx, dw = _rnn.backward(dy.data)
# return no sequence, y shape = {bs, hidden}
_rnn = autograd._RNN(rnn_handle, return_sequences=False)
dy = tensor.Tensor(shape=(batch_size, directions * hidden_size),
device=dev).gaussian(0, 1)
y = _rnn(x, hx, cx, w)[0]
assert y.shape == dy.shape
# backward
dx, dhx, dcx, dw = _rnn.backward(dy.data)
def cossim_helper(self, dev):
A = np.random.randn(*[3, 10]).astype(np.float32)
B = np.random.randn(*[3, 10]).astype(np.float32)
a = tensor.from_numpy(A)
a.to_device(dev)
b = tensor.from_numpy(B)
b.to_device(dev)
DY = np.random.randn(3).astype(np.float32)
dy = tensor.from_numpy(DY)
dy.to_device(dev)
y = autograd.cossim(a, b)
da, db = y.creator.backward(dy.data) # CTensor
self.check_shape(y.shape, (3,))
self.check_shape(da.shape(), (3, 10))
self.check_shape(db.shape(), (3, 10))
def test_cossim_cpu(self):
self.cossim_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_cossim_gpu(self):
self.cossim_helper(gpu_dev)
def expand_helper(self, dev):
shape = [3, 1]
X = np.reshape(np.arange(1, np.prod(shape) + 1, dtype=np.float32),
shape)
x = tensor.from_numpy(X)
x.to_device(dev)
# dim_changed
new_shape = [2, 1, 6]
y_t = X * np.ones(new_shape, dtype=np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.expand(x, new_shape)
dx = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
self.check_shape(dx.shape(), tuple(shape))
# dim_unchanged
new_shape_2 = [3, 4]
y_t2 = np.tile(X, 4)
dy2 = tensor.from_numpy(y_t2)
dy2.to_device(dev)
y2 = autograd.expand(x, new_shape_2)
dx2 = y2.creator.backward(dy2.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y2), y_t2)
self.check_shape(dx2.shape(), tuple(shape))
def test_expand_cpu(self):
self.expand_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_expand_gpu(self):
self.expand_helper(gpu_dev)
def pad_helper(self, dev):
X = np.array([
[1.0, 1.2],
[2.3, 3.4],
[4.5, 5.7],
]).astype(np.float32)
Y1 = np.array([
[0.0, 0.0, 1.0, 1.2],
[0.0, 0.0, 2.3, 3.4],
[0.0, 0.0, 4.5, 5.7],
],).astype(np.float32)
Y2 = np.array([
[1.0, 1.2, 1.0, 1.2],
[2.3, 3.4, 2.3, 3.4],
[4.5, 5.7, 4.5, 5.7],
],).astype(np.float32)
Y3 = np.array([
[1.0, 1.0, 1.0, 1.2],
[2.3, 2.3, 2.3, 3.4],
[4.5, 4.5, 4.5, 5.7],
],).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
pads = [0, 2, 0, 0]
DY = np.random.randn(3, 4).astype(np.float32)
dy = tensor.from_numpy(DY)
dy.to_device(dev)
y1 = autograd.pad(x, "constant", pads)
y2 = autograd.pad(x, "reflect", pads)
y3 = autograd.pad(x, "edge", pads)
dx1 = y1.creator.backward(dy.data)
dx2 = y2.creator.backward(dy.data)
dx3 = y3.creator.backward(dy.data)
pad_width = []
half_width = len(pads) // 2
for i in range(half_width):
pad_width += [[pads[i], pads[i + half_width]]]
np.testing.assert_array_almost_equal(tensor.to_numpy(y1),
np.pad(
X,
pad_width=pad_width,
mode="constant",
constant_values=0.,
),
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(y2),
np.pad(
X,
pad_width=pad_width,
mode="reflect",
),
decimal=5)
np.testing.assert_array_almost_equal(tensor.to_numpy(y3),
np.pad(
X,
pad_width=pad_width,
mode="edge",
),
decimal=5)
self.check_shape(dx1.shape(), (3, 2))
self.check_shape(dx2.shape(), (3, 2))
self.check_shape(dx3.shape(), (3, 2))
def test_pad_cpu(self):
self.pad_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_pad_gpu(self):
self.pad_helper(gpu_dev)
def upsample_helper(self, dev):
X = np.array([[[
[1, 2],
[3, 4],
]]], dtype=np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
scales = np.array([1.0, 1.0, 2.0, 3.0], dtype=np.float32)
y_t = np.array([[[
[1, 1, 1, 2, 2, 2],
[1, 1, 1, 2, 2, 2],
[3, 3, 3, 4, 4, 4],
[3, 3, 3, 4, 4, 4],
]]],
dtype=np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.upsample(x, "nearest", scales)
dx = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
self.check_shape(dx.shape(), tuple(X.shape))
def test_upsample_cpu(self):
self.upsample_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_upsample_gpu(self):
self.upsample_helper(gpu_dev)
def depth_space_helper(self, dev):
# (1, 8, 2, 3) input tensor
X = np.array(
[[[[0., 1., 2.], [3., 4., 5.]], [[9., 10., 11.], [12., 13., 14.]],
[[18., 19., 20.], [21., 22., 23.]],
[[27., 28., 29.], [30., 31., 32.]],
[[36., 37., 38.], [39., 40., 41.]],
[[45., 46., 47.], [48., 49., 50.]],
[[54., 55., 56.], [57., 58., 59.]],
[[63., 64., 65.], [66., 67., 68.]]]],
dtype=np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
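# DCR mode rearranges the depth blocks in (depth, column, row) order and
# CRD in (column, row, depth) order, following the ONNX DepthToSpace modes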
# (1, 2, 4, 6) output tensor
y_t = np.array(
[[[[0., 18., 1., 19., 2., 20.], [36., 54., 37., 55., 38., 56.],
[3., 21., 4., 22., 5., 23.], [39., 57., 40., 58., 41., 59.]],
[[9., 27., 10., 28., 11., 29.], [45., 63., 46., 64., 47., 65.],
[12., 30., 13., 31., 14., 32.], [48., 66., 49., 67., 50., 68.]]]
],
dtype=np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.depth_to_space(x, 2, "DCR")
dx = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx)), X)
y = autograd.space_to_depth(dy, 2, "DCR")
dx = y.creator.backward(x.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), X)
np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx)), y_t)
y_t = np.array(
[[[[0., 9., 1., 10., 2., 11.], [18., 27., 19., 28., 20., 29.],
[3., 12., 4., 13., 5., 14.], [21., 30., 22., 31., 23., 32.]],
[[36., 45., 37., 46., 38., 47.], [54., 63., 55., 64., 56., 65.],
[39., 48., 40., 49., 41., 50.], [57., 66., 58., 67., 59., 68.]]]
],
dtype=np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.depth_to_space(x, 2, "CRD")
dx = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx)), X)
y = autograd.space_to_depth(dy, 2, "CRD")
dx = y.creator.backward(x.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), X)
np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx)), y_t)
def test_depth_space_cpu(self):
self.depth_space_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_depth_space_gpu(self):
self.depth_space_helper(gpu_dev)
def test_invalid_inputs(self, dev=cpu_dev):
_1d = tensor.Tensor((10,), dev)
_2d = tensor.Tensor((10, 10), dev)
_3d = tensor.Tensor((10, 10, 10), dev)
self.assertRaises(AssertionError, autograd.softmax_cross_entropy, _2d,
_3d)
self.assertRaises(AssertionError, autograd.mse_loss, _2d, _3d)
self.assertRaises(AssertionError, autograd.add_bias, _2d, _1d, 3)
self.assertRaises(AssertionError, autograd.ranking_loss, _2d, _1d)
def where_helper(self, dev):
X = np.array([[1, 2], [3, 4]], dtype=np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
X2 = np.array([[9, 8], [7, 6]], dtype=np.float32)
x2 = tensor.from_numpy(X2)
x2.to_device(dev)
condition = [[True, False], [True, True]]
y_t = np.where(condition, X, X2)
dx1_t = np.array([[1, 0], [3, 4]], dtype=np.float32)
dx2_t = np.array([[0, 8], [0, 0]], dtype=np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.where(x, x2, condition)
dx1, dx2 = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
np.testing.assert_array_almost_equal(
tensor.to_numpy(tensor.from_raw_tensor(dx1)), dx1_t)
np.testing.assert_array_almost_equal(
tensor.to_numpy(tensor.from_raw_tensor(dx2)), dx2_t)
def test_where_cpu(self):
self.where_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_where_gpu(self):
self.where_helper(gpu_dev)
def rounde_helper(self, dev):
X = np.array([
0.1, 0.5, 0.9, 1.2, 1.5, 1.8, 2.3, 2.5, 2.7, -1.1, -1.5, -1.9, -2.2,
-2.5, -2.8
]).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
y_t = np.array(
[0., 0., 1., 1., 2., 2., 2., 2., 3., -1., -2., -2., -2., -2.,
-3.]).astype(np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.rounde(x)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
def test_rounde_cpu(self):
self.rounde_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_rounde_gpu(self):
self.rounde_helper(gpu_dev)
def round_helper(self, dev):
X = np.array([
0.1, 0.5, 0.9, 1.2, 1.5, 1.8, 2.3, 2.5, 2.7, -1.1, -1.5, -1.9, -2.2,
-2.5, -2.8
]).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
y_t = np.array(
[0., 1., 1., 1., 2., 2., 2., 3., 3., -1., -2., -2., -2., -3.,
-3.]).astype(np.float32)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
y = autograd.round(x)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
def test_round_cpu(self):
self.round_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_round_gpu(self):
self.round_helper(gpu_dev)
def embedding_helper(self, dev):
embedding = layer.Embedding(10, 3)
X = np.array([[0, 1, 2, 3], [9, 8, 7, 6]])
x = tensor.from_numpy(X)
x.to_device(dev)
dy = tensor.Tensor(shape=(2, 4, 3), device=dev)
dy.gaussian(0.0, 1.0)
y = embedding(x) # PyTensor
dx, dW = y.creator.backward(dy.data) # CTensor
self.check_shape(y.shape, (2, 4, 3))
self.check_shape(dx.shape(), (2, 4))
self.check_shape(dW.shape(), (10, 3))
def test_embedding_cpu(self):
self.embedding_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_embedding_gpu(self):
self.embedding_helper(gpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def _cossim_value(self, dev=gpu_dev):
# numpy val
np.random.seed(0)
bs = 1000
vec_s = 1200
a = np.random.random((bs, vec_s)).astype(np.float32)
b = np.random.random((bs, vec_s)).astype(np.float32)
dy = np.random.random((bs,)).astype(np.float32)
# singa tensor
ta = tensor.from_numpy(a)
tb = tensor.from_numpy(b)
tdy = tensor.from_numpy(dy)
ta.to_device(dev)
tb.to_device(dev)
tdy.to_device(dev)
# singa forward and backward
ty = autograd.cossim(ta, tb)
tda, tdb = ty.creator.backward(tdy.data)
np_forward = list()
for i in range(len(a)):
a_norm = np.linalg.norm(a[i])
b_norm = np.linalg.norm(b[i])
ab_dot = np.dot(a[i], b[i])
out = ab_dot / (a_norm * b_norm)
np_forward.append(out)
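# numpy reference backward: with cos = a.b / (|a||b|),
# da = dy * (b / (|a||b|) - cos * a / |a|^2) and symmetrically for db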
np_backward_a = list()
np_backward_b = list()
for i in range(len(a)):
a_norm = np.linalg.norm(a[i])
b_norm = np.linalg.norm(b[i])
da = dy[i] * (b[i] / (a_norm * b_norm) - (np_forward[i] * a[i]) /
(a_norm * a_norm))
db = dy[i] * (a[i] / (a_norm * b_norm) - (np_forward[i] * b[i]) /
(b_norm * b_norm))
np_backward_a.append(da)
np_backward_b.append(db)
np.testing.assert_array_almost_equal(tensor.to_numpy(ty),
np.array(np_forward))
np.testing.assert_array_almost_equal(
tensor.to_numpy(tensor.from_raw_tensor(tda)), np_backward_a)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_cossim_value_gpu(self):
self._cossim_value(gpu_dev)
def test_cossim_value_cpu(self):
self._cossim_value(cpu_dev)
def test_mse_loss_value(self, dev=cpu_dev):
y = np.random.random((1000, 1200)).astype(np.float32)
tar = np.random.random((1000, 1200)).astype(np.float32)
# get singa value
sy = tensor.from_numpy(y, dev)
starget = tensor.from_numpy(tar, dev)
sloss = autograd.mse_loss(sy, starget)
sgrad = sloss.creator.backward()
# get np value result
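# MSE = mean((t - y)^2) over all elements, so dL/dy = -2 * (t - y) / N
# where N is the total number of elements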
np_loss = np.mean(np.square(tar - y))
np_grad = -2 * (tar - y) / np.prod(tar.shape)
# value check
np.testing.assert_array_almost_equal(
tensor.to_numpy(tensor.from_raw_tensor(sgrad)), np_grad)
np.testing.assert_array_almost_equal(tensor.to_numpy(sloss), np_loss)
def erf_helper(self, dev):
X = np.array([
0.1, 0.5, 0.9, 1.2, 1.5, 1.8, 2.3, 2.5, 2.7, -1.1, -1.5, -1.9, -2.2,
-2.5, -2.8
]).astype(np.float32)
x = tensor.from_numpy(X)
x.to_device(dev)
import math
y_t = np.vectorize(math.erf)(X)
dy = tensor.from_numpy(y_t)
dy.to_device(dev)
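# erf'(z) = 2 / sqrt(pi) * exp(-z^2); note the reference gradient dx_t below
# is computed from y_t, which is also what is fed to backward() as dy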
dx_t = 2. / np.pi**0.5 * np.exp(-np.power(y_t, 2))
y = autograd.erf(x)
dx = y.creator.backward(dy.data)
np.testing.assert_array_almost_equal(tensor.to_numpy(y), y_t)
np.testing.assert_array_almost_equal(
tensor.to_numpy(tensor.from_raw_tensor(dx)), dx_t)
def test_erf_cpu(self):
self.erf_helper(cpu_dev)
@unittest.skipIf(not singa_wrap.USE_CUDA, 'CUDA is not enabled')
def test_erf_gpu(self):
self.erf_helper(gpu_dev)
if __name__ == '__main__':
unittest.main()