tests/python/unittest/test_numpy_op.py - mxnet - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # pylint: skip-file
 from __future__ import absolute_import
 from distutils.version import StrictVersion
 import sys
 import copy
 import itertools
 from mxnet.gluon.parameter import Parameter
 import numpy as onp
 import platform
 import mxnet as mx
 import scipy.stats as ss
 import scipy.special as scipy_special
 import pytest
 import mxnet.ndarray.numpy._internal as _npi
 from functools import reduce
 from packaging.version import parse
 from mxnet import np, npx
 from mxnet.gluon import HybridBlock
 from mxnet.base import MXNetError
 from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray
 from mxnet.test_utils import check_numeric_gradient, use_np, collapse_sum_like, effective_dtype
 from mxnet.test_utils import new_matrix_with_real_eigvals_nd
 from mxnet.test_utils import new_sym_matrix_with_real_eigvals_nd
 from common import assertRaises, retry, xfail_when_nonstandard_decimal_separator
 import random
 from mxnet.test_utils import verify_generator, gen_buckets_probs_with_ppf
 from mxnet.numpy_op_signature import _get_builtin_op
 from mxnet.test_utils import is_op_runnable, has_tvm_ops, rand_shape_2d
 from mxnet.operator import get_all_registered_operators
 from common import assert_raises_cuda_not_satisfied
 from numpy.testing import assert_allclose


 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 @pytest.mark.parametrize('a_shape,b_shape,axes', [
     ((3, 5), (5, 4), 1),
     ((3,), (3,), 1),
     ((3, 4, 5, 3, 2), (5, 3, 2, 1, 2), 3),
     ((3, 5, 4, 3, 2), (2, 3, 5, 1, 2), [[1, 3, 4], [2, 1, 0]]),
     ((3, 5, 4), (5, 4, 3), [[1, 0, 2], [0, 2, 1]]),
     ((3, 5, 4), (5, 3, 4), [[2, 0], [-1, -2]]),
     ((2, 2), (2, 2), 2),
     ((3, 5, 4), (5, ), [[-2], [0]]),
     ((3, 5, 4), (5, ), [[1], [0]]),
     ((2,), (2, 3), 1),
     ((3,), (3,), 0),
     ((2,), (2, 3), 0),
     ((3, 5, 4), (5, ), 0),
     ((2, 3, 4), (4, 3, 2), [[], []]),
     ((3, 0), (0, 5), 1),
     ((3, 0), (0, 4), [[1], [0]]),
     ((0, 3), (3, 5), 1),
     ((0, 3), (5, 0), [[0], [1]])
 ])
 def test_np_tensordot(a_shape, b_shape, axes, hybridize, dtype):
     class TestTensordot(HybridBlock):
         def __init__(self, axes):
             super(TestTensordot, self).__init__()
             self._axes = axes

         def forward(self, a, b):
             return np.tensordot(a, b, self._axes)

     def tensordot_backward(out_grad, a, b, axes=2):
         if (a.ndim < 1) or (b.ndim < 1):
             raise ValueError('An input is zero-dim')

         if onp.isscalar(axes):
             a_axes_summed = [i + a.ndim - axes for i in range(axes)]
             b_axes_summed = [i for i in range(axes)]
         else:
             if len(axes) != 2:
                 raise ValueError('Axes must consist of two arrays.')
             a_axes_summed, b_axes_summed = axes
             if onp.isscalar(a_axes_summed):
                 a_axes_summed = a_axes_summed,
             if onp.isscalar(b_axes_summed):
                 b_axes_summed = b_axes_summed,

             for i in range(len(a_axes_summed)):
                 a_axes_summed[i] = (a_axes_summed[i] + a.ndim) % a.ndim

             for i in range(len(b_axes_summed)):
                 b_axes_summed[i] = (b_axes_summed[i] + b.ndim) % b.ndim

         if len(a_axes_summed) != len(b_axes_summed):
             raise ValueError('Axes length mismatch')

         a_axes_remained = []
         for i in range(a.ndim):
             if not (i in a_axes_summed):
                 a_axes_remained.append(i)
         a_axes = a_axes_remained[:] + a_axes_summed[:]

         b_axes_remained = []
         for i in range(b.ndim):
             if not (i in b_axes_summed):
                 b_axes_remained.append(i)
         b_axes = b_axes_summed[:] + b_axes_remained[:]

         ad1 = onp.prod([a.shape[i] for i in a_axes_remained]) if len(a_axes_remained) > 0 else 1
         ad2 = onp.prod([a.shape[i] for i in a_axes_summed]) if len(a_axes_summed) > 0 else 1
         bd1 = onp.prod([b.shape[i] for i in b_axes_summed]) if len(b_axes_summed) > 0 else 1
         bd2 = onp.prod([b.shape[i] for i in b_axes_remained]) if len(b_axes_remained) > 0 else 1

         out_grad = out_grad.reshape((ad1, bd2))

         new_a = onp.transpose(a, a_axes)
         new_a_shape = new_a.shape[:]
         new_a = new_a.reshape((ad1, ad2))
         new_b = onp.transpose(b, b_axes)
         new_b_shape = new_b.shape[:]
         new_b = new_b.reshape((bd1, bd2))

         reverse_a_axes = [0 for i in a_axes]
         for i in range(len(a_axes)):
             reverse_a_axes[a_axes[i]] = i

         reverse_b_axes = [0 for i in b_axes]
         for i in range(len(b_axes)):
             reverse_b_axes[b_axes[i]] = i

         grad_b = onp.dot(new_a.T, out_grad).reshape(new_b_shape)
         grad_b = onp.transpose(grad_b, reverse_b_axes)
         grad_a = onp.dot(out_grad, new_b.T).reshape(new_a_shape)
         grad_a = onp.transpose(grad_a, reverse_a_axes)

         return [grad_a, grad_b]

     test_tensordot = TestTensordot(axes)
     if hybridize:
         test_tensordot.hybridize()
     a = rand_ndarray(shape = a_shape, dtype = dtype).as_np_ndarray()
     b = rand_ndarray(shape = b_shape, dtype = dtype).as_np_ndarray()
     a.attach_grad()
     b.attach_grad()

     np_out = onp.tensordot(a.asnumpy(), b.asnumpy(), axes)
     with mx.autograd.record():
         mx_out = test_tensordot(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol = 1e-3, atol = 1e-5)
     mx_out.backward()
     np_backward = tensordot_backward(onp.ones(np_out.shape), a.asnumpy(), b.asnumpy(), axes)
     assert_almost_equal(a.grad.asnumpy(), np_backward[0], rtol = 1e-3, atol=1e-5)
     assert_almost_equal(b.grad.asnumpy(), np_backward[1], rtol = 1e-3, atol=1e-5)

     # Test imperative once again
     mx_out = np.tensordot(a, b, axes)
     np_out = onp.tensordot(a.asnumpy(), b.asnumpy(), axes)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     # test numeric gradient
     if (onp.prod(a_shape) > 0 and onp.prod(b_shape) > 0):
         a_sym = mx.sym.Variable("a").as_np_ndarray()
         b_sym = mx.sym.Variable("b").as_np_ndarray()
         mx_sym = mx.sym.np.tensordot(a_sym, b_sym, axes).as_nd_ndarray()
         check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
           rtol=1e-1, atol=1e-1, dtype = dtype)

     # General Gradient Test
     for a_grad_status in ['add', 'write']:
         for b_grad_status in ['add', 'write']:
             a = mx.np.random.normal(0, 1, a_shape)
             b = mx.np.random.normal(0, 1, b_shape)
             a.attach_grad(a_grad_status)
             b.attach_grad(b_grad_status)
             if a_grad_status == 'add':
                 ori_a_grad = mx.np.random.normal(0, 1, a_shape)
                 if a.ndim == 0:
                     a.grad[()] = ori_a_grad
                 else:
                     a.grad[:] = ori_a_grad
             if b_grad_status == 'add':
                 ori_b_grad = mx.np.random.normal(0, 1, b_shape)
                 if b.ndim == 0:
                     b.grad[()] = ori_b_grad
                 else:
                     b.grad[:] = ori_b_grad

             with mx.autograd.record():
                 mx_out = mx.np.tensordot(a, b, axes)
                 out_grad = mx.np.random.normal(0, 1, mx_out.shape)
                 loss = (mx_out * out_grad).sum()
                 loss.backward()

             gt_in_grad = tensordot_backward(out_grad.asnumpy(), a.asnumpy(), b.asnumpy(), axes)

             if(a_grad_status == 'add'):
                 gt_in_grad[0] += ori_a_grad
             if(b_grad_status == 'add'):
                 gt_in_grad[1] += ori_b_grad

             assert_almost_equal(a.grad.asnumpy(), gt_in_grad[0], rtol=1e-2, atol=1e-2)
             assert_almost_equal(b.grad.asnumpy(), gt_in_grad[1], rtol=1e-2, atol=1e-2)


 @use_np
 @pytest.mark.parametrize('shape_a,shape_b', [
     ((3, 0), (0, 4)),
     ((3,), (3,)),
     ((3, 4), (4, 5)),
     ((), ()),
     ((3, 4, 5), ()),
     ((), (3, 4, 5)),
     ((3, 4, 5), (5, )),
     ((3, 4, 5), (5, 2)),
     ((5,), (5, 2)),
     ((3, 5, 4), (5, 4, 3)),
     ((3, 4), (5, 4, 3)),
     ((4,), (5, 4, 3))
 ])
 def test_np_dot(shape_a, shape_b):
     eps = 1e-3

     np_a = onp.random.uniform(-1.0, 1.0, shape_a)
     np_a[abs(np_a) < eps] = 2 * eps
     np_b = onp.random.uniform(-1.0, 1.0, shape_b)
     np_b[abs(np_b) < eps] = 2 * eps
     a = mx.nd.array(np_a)
     b = mx.nd.array(np_b)
     np_res = onp.dot(np_a, np_b)
     mx_res = np.dot(a.as_np_ndarray(), b.as_np_ndarray())
     assert mx_res.shape == np_res.shape
     assert_almost_equal(np_res, mx_res.asnumpy(), rtol=1e-5, atol=1e-5)
     mx_a = mx.sym.Variable("a")
     mx_b = mx.sym.Variable("b")
     mx_sym = mx.sym.np.dot(mx_a.as_np_ndarray(), mx_b.as_np_ndarray()).as_nd_ndarray()
     if (len(shape_a) > 0 and len(shape_b) > 0 and onp.prod(shape_a) > 0 and onp.prod(shape_b) > 0):
         check_numeric_gradient(mx_sym, {"a": a, "b": b}, numeric_eps=eps, rtol=1e-2, atol=1e-3)


 @use_np
 @pytest.mark.parametrize('shape_a,shape_b', [
     ((4, 5), (2, 3)),
     ((3, 4, 5), (6, ))
 ])
 def test_np_dot_error(shape_a, shape_b):
     a = mx.nd.array(random.random()) if len(shape_a) == 0 else rand_ndarray(shape_a)
     b = mx.nd.array(random.random()) if len(shape_b) == 0 else rand_ndarray(shape_b)
     with pytest.raises(mx.base.MXNetError):
         mx_res = np.dot(a.as_np_ndarray(), b.as_np_ndarray())


 @use_np
 @pytest.mark.parametrize('shape', [(), (5,), (3, 3)])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 def test_np_vdot(shape, dtype, hybridize):
     class TestVdot(HybridBlock):
         def __init__(self):
             super(TestVdot, self).__init__()

         def forward(self, a, b):
             return np.vdot(a, b)

     def vdot_backward(a, b):
         return [b, a]

     test_vdot = TestVdot()
     if hybridize:
         test_vdot.hybridize()
     a = rand_ndarray(shape=shape, dtype=dtype).as_np_ndarray()
     b = rand_ndarray(shape=shape, dtype=dtype).as_np_ndarray()
     a.attach_grad()
     b.attach_grad()

     np_out = onp.vdot(a.asnumpy(), b.asnumpy())
     with mx.autograd.record():
         mx_out = test_vdot(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol = 1e-3, atol = 1e-5)
     mx_out.backward()
     np_backward = vdot_backward(a.asnumpy(), b.asnumpy())
     assert_almost_equal(a.grad.asnumpy(), np_backward[0], rtol = 1e-2, atol=1e-2)
     assert_almost_equal(b.grad.asnumpy(), np_backward[1], rtol = 1e-2, atol=1e-2)

     # Test imperative once again
     mx_out = np.vdot(a, b)
     np_out = onp.vdot(a.asnumpy(), b.asnumpy())
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     # test numeric gradient
     if len(shape) > 0 and onp.prod(shape) > 0:
         a_sym = mx.sym.Variable("a").as_np_ndarray()
         b_sym = mx.sym.Variable("b").as_np_ndarray()
         mx_sym = mx.sym.np.vdot(a_sym, b_sym).as_nd_ndarray()
         check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
           rtol=1e-1, atol=1e-1, dtype=dtype)


 @use_np
 @pytest.mark.parametrize('a_shape,b_shape', [
     ((3,), (3,)),
     ((2, 3), (3,)),
     ((3,), (2, 3))
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 def test_np_inner(a_shape, b_shape, dtype, hybridize):
     class TestInner(HybridBlock):
         def __init__(self):
             super(TestInner, self).__init__()

         def forward(self, a, b):
             return np.inner(a, b)

     def inner_backward(a, b):
         a_axes_summed = [a.ndim - 1]
         b_axes_summed = [b.ndim - 1]

         a_axes_remained = []
         for i in range(a.ndim):
             if not (i in a_axes_summed):
                 a_axes_remained.append(i)
         a_axes = a_axes_remained[:] + a_axes_summed[:]

         b_axes_remained = []
         for i in range(b.ndim):
             if not (i in b_axes_summed):
                 b_axes_remained.append(i)
         b_axes = b_axes_summed[:] + b_axes_remained[:]

         ad1 = onp.prod([a.shape[i] for i in a_axes_remained]) if len(a_axes_remained) > 0 else 1
         ad2 = onp.prod([a.shape[i] for i in a_axes_summed]) if len(a_axes_summed) > 0 else 1
         bd1 = onp.prod([b.shape[i] for i in b_axes_summed]) if len(b_axes_summed) > 0 else 1
         bd2 = onp.prod([b.shape[i] for i in b_axes_remained]) if len(b_axes_remained) > 0 else 1

         out_grad = onp.ones((ad1, bd2))

         new_a = onp.transpose(a, a_axes)
         new_a_shape = new_a.shape[:]
         new_a = new_a.reshape((ad1, ad2))
         new_b = onp.transpose(b, b_axes)
         new_b_shape = new_b.shape[:]
         new_b = new_b.reshape((bd1, bd2))

         reverse_a_axes = [0 for i in a_axes]
         for i in range(len(a_axes)):
             reverse_a_axes[a_axes[i]] = i

         reverse_b_axes = [0 for i in b_axes]
         for i in range(len(b_axes)):
             reverse_b_axes[b_axes[i]] = i

         grad_b = onp.dot(new_a.T, out_grad).reshape(new_b_shape)
         grad_b = onp.transpose(grad_b, reverse_b_axes)
         grad_a = onp.dot(out_grad, new_b.T).reshape(new_a_shape)
         grad_a = onp.transpose(grad_a, reverse_a_axes)

         return [grad_a, grad_b]

     test_inner = TestInner()
     if hybridize:
         test_inner.hybridize()
     a = rand_ndarray(shape=a_shape, dtype=dtype).as_np_ndarray()
     b = rand_ndarray(shape=b_shape, dtype=dtype).as_np_ndarray()
     a.attach_grad()
     b.attach_grad()

     np_out = onp.inner(a.asnumpy(), b.asnumpy())
     with mx.autograd.record():
         mx_out = test_inner(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol = 1e-3, atol = 1e-5)
     mx_out.backward()
     np_backward = inner_backward(a.asnumpy(), b.asnumpy())
     assert_almost_equal(a.grad.asnumpy(), np_backward[0], rtol = 1e-2, atol=1e-2)
     assert_almost_equal(b.grad.asnumpy(), np_backward[1], rtol = 1e-2, atol=1e-2)

     # Test imperative once again
     mx_out = np.inner(a, b)
     np_out = onp.inner(a.asnumpy(), b.asnumpy())
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     # test numeric gradient
     a_sym = mx.sym.Variable("a").as_np_ndarray()
     b_sym = mx.sym.Variable("b").as_np_ndarray()
     mx_sym = mx.sym.np.inner(a_sym, b_sym).as_nd_ndarray()
     check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
       rtol=1e-1, atol=1e-1, dtype=dtype)


 @use_np
 @pytest.mark.parametrize('a_shape,b_shape', [
     ((3,), (3,)),
     ((2, 3), (6,)),
     ((6,), (2, 3))
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 def test_np_outer(a_shape, b_shape, dtype, hybridize):
     class TestOuter(HybridBlock):
         def __init__(self):
             super(TestOuter, self).__init__()

         def forward(self, a, b):
             return np.outer(a, b)

     test_outer = TestOuter()
     if hybridize:
         test_outer.hybridize()
     a = rand_ndarray(shape=a_shape, dtype=dtype).as_np_ndarray()
     b = rand_ndarray(shape=b_shape, dtype=dtype).as_np_ndarray()
     a.attach_grad()
     b.attach_grad()

     np_out = onp.outer(a.asnumpy(), b.asnumpy())
     with mx.autograd.record():
         mx_out = test_outer(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
     mx_out.backward()

     # Test imperative once again
     mx_out = np.outer(a, b)
     np_out = onp.outer(a.asnumpy(), b.asnumpy())
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     # test numeric gradient
     a_sym = mx.sym.Variable("a").as_np_ndarray()
     b_sym = mx.sym.Variable("b").as_np_ndarray()
     mx_sym = mx.sym.np.outer(a_sym, b_sym).as_nd_ndarray()
     check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
                            rtol=1e-1, atol=1e-1, dtype=dtype)


 @use_np
 @pytest.mark.parametrize('shape_a,shape_b', [
     ((3,), (3,)),
     ((3, 4), (4, 5)),
     ((3, 0), (0, 4)),
     ((4, 5), (5,)),
     ((3, 4, 5), (5,)),
     ((5,), (5, 2)),
     ((2,), (4, 2, 3)),
     ((2, 1, 3, 4, 5), (5, 2)),
     ((1, 3, 5, 4), (1, 4, 3)),
     ((3, 5, 4), (2, 1, 4, 3)),
     ((3, 4), (1, 5, 4, 3))
 ])
 @pytest.mark.parametrize('grad_req_a', ['write', 'add', 'null'])
 @pytest.mark.parametrize('grad_req_b', ['write', 'add', 'null'])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 def test_np_matmul(shape_a, shape_b, grad_req_a, grad_req_b,
                    dtype, hybridize):
     class TestMatmul(HybridBlock):
         def __init__(self):
             super(TestMatmul, self).__init__()

         def forward(self, a, b):
             return np.matmul(a, b)

     def matmul_backward(a, b):
         def ShapeInfer(mat_a, mat_b):
             if mat_a.ndim == 1:
                 mat_a = mat_a.reshape((1, mat_a.size))
             if mat_b.ndim == 1:
                 mat_b = mat_b.reshape((mat_b.size, 1))
             ndim = max(mat_a.ndim, mat_b.ndim)
             newshape_a = list(onp.array(mat_a, ndmin=ndim).shape)
             newshape_b = list(onp.array(mat_b, ndmin=ndim).shape)
             if ndim >= 3:
                 pre_shape = onp.fmax(newshape_a[ndim - 3::-1], newshape_b[ndim - 3::-1])
                 newshape_a[ndim - 3::-1] = pre_shape
                 newshape_b[ndim - 3::-1] = pre_shape
             else:
                 pre_shape = onp.array([])
             out_shape = onp.append(pre_shape[::-1].astype(onp.int64), [newshape_a[ndim - 2], newshape_b[ndim - 1]])
             return [ndim, newshape_a, newshape_b, out_shape]

         def ShapeReduce(mat, shape, is_b=False):
             ndim = mat.ndim
             if is_b and len(shape) == 1:
                 rng = onp.arange(ndim - 2)
             else:
                 pre_len = ndim - len(shape)
                 in_pre = onp.array(mat.shape[pre_len : ndim - 2])
                 out_pre = onp.array(shape[:len(shape) - 2])
                 diff = onp.nonzero(in_pre != out_pre)[0] + pre_len
                 rng = onp.append(onp.arange(ndim - len(shape)), diff)
             mat = onp.sum(mat, axis=tuple(rng))
             return mat.reshape(shape)

         a_shape = a.shape
         b_shape = b.shape
         [ndim, newshape_a, newshape_b, out_shape] = ShapeInfer(a, b)
         new_a = onp.broadcast_to(a, newshape_a)
         if len(b_shape) == 1:
             new_b = onp.broadcast_to(b.reshape((b.size, 1)), newshape_b)
         else:
             new_b = onp.broadcast_to(b, newshape_b)

         ad1 = new_a.shape[ndim - 2]
         ad2 = new_a.shape[ndim - 1]
         bd1 = new_b.shape[ndim - 2]
         bd2 = new_b.shape[ndim - 1]
         a_T = onp.moveaxis(new_a, [ndim - 2, ndim - 1], [ndim - 1, ndim - 2])
         b_T = onp.moveaxis(new_b, [ndim - 2, ndim - 1], [ndim - 1, ndim - 2])
         out_grad = onp.ones(out_shape)
         grad_b = onp.matmul(a_T, out_grad)
         grad_b = ShapeReduce(grad_b, b_shape, is_b=True)
         grad_a = onp.matmul(out_grad, b_T)
         grad_a = ShapeReduce(grad_a, a_shape)
         return [grad_a, grad_b]

     eps = 1E-4
     test_matmul = TestMatmul()
     if hybridize:
         test_matmul.hybridize()
     np_a = onp.random.uniform(-1.0, 1.0, shape_a).astype(dtype)
     np_a[abs(np_a) < eps] = 2 * eps
     np_b = onp.random.uniform(-1.0, 1.0, shape_b).astype(dtype)
     np_b[abs(np_b) < eps] = 2 * eps
     a = mx.np.array(np_a, dtype=dtype)
     a.attach_grad(grad_req=grad_req_a)
     b = mx.np.array(np_b, dtype=dtype)
     b.attach_grad(grad_req=grad_req_b)

     np_out = onp.matmul(np_a, np_b)
     with mx.autograd.record():
         mx_out = test_matmul(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(np_out, mx_out.asnumpy(), rtol=eps, atol=eps)

     if grad_req_a != 'null' or grad_req_b != 'null':
         mx_out.backward()
         np_backward = matmul_backward(np_a, np_b)
         if grad_req_a == 'null':
             assert a.grad is None
         else:
             assert_almost_equal(a.grad.asnumpy(), np_backward[0], rtol = eps, atol=eps)
         if grad_req_b == 'null':
             assert b.grad is None
         else:
             assert_almost_equal(b.grad.asnumpy(), np_backward[1], rtol = eps, atol=eps)

     mx_out = np.matmul(a, b)
     np_out = onp.matmul(np_a, np_b)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=eps, atol=eps)


 @pytest.mark.parametrize('shape_a,shape_b', [
     ((1,), (2,)),            # mismatched vector vector
     ((2, 1,), (2,)),         # mismatched matrix vector
     ((2,), (1, 2)),          # mismatched vector matrix
     ((1, 2), (3, 1)),        # mismatched matrix matrix
     ((1,), ()),              # vector scalar
     ((), (1,)),              # scalar vector
     ((1, 1), ()),            # matrix scalar
     ((), (1, 1)),            # scalar matrix
     ((2, 2, 1), (3, 1, 2)),  # cannot broadcast
 ])
 def test_np_matmul_error(shape_a, shape_b):
     a = np.random.uniform(size=shape_a)
     b = np.random.uniform(size=shape_b)
     with pytest.raises(MXNetError):
         np.matmul(a, b)


 @use_np
 @pytest.mark.parametrize('a_shape,b_shape', [
     ((3,), (3,)),
     ((2, 3), (3,)),
     ((2, 3, 4), (2,)),
     ((3, 2), ())
 ])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_kron(a_shape, b_shape, dtype, hybridize):
     def np_kron_backward(ograd, a, b):
         ndim = ograd.ndim
         # Make ndim equal
         if ndim > a.ndim:
             a = a.reshape((1,)*(ndim - a.ndim) + a.shape)
         else:
             b = b.reshape((1,)*(ndim - b.ndim) + b.shape)
         assert(a.ndim == b.ndim)

         # Compute agrad
         agrad = onp.zeros(a.shape)
         for i in range(a.size):
             ia = onp.asarray(onp.unravel_index(i, a.shape))
             for j in range(b.size):
                 jb = onp.asarray(onp.unravel_index(j, b.shape))
                 k = ia * onp.asarray(b.shape) + jb
                 agrad[tuple(ia)] += ograd[tuple(k)] * b[tuple(jb)]
         # Compute bgrad
         bgrad = onp.zeros(b.shape)
         for j in range(b.size):
             jb = onp.asarray(onp.unravel_index(j, b.shape))
             for i in range(a.size):
                 ia = onp.asarray(onp.unravel_index(i, a.shape))
                 k = ia * onp.asarray(b.shape) + jb
                 bgrad[tuple(jb)] += ograd[tuple(k)] * a[tuple(ia)]
         return [agrad, bgrad]

     class TestKron(HybridBlock):
         def __init__(self):
             super(TestKron, self).__init__()

         def forward(self, a, b):
             return np.kron(a, b)

     test_kron = TestKron()
     if hybridize:
         test_kron.hybridize()
     a = rand_ndarray(shape=a_shape, dtype=dtype).as_np_ndarray()
     b = rand_ndarray(shape=b_shape, dtype=dtype).as_np_ndarray()
     a.attach_grad()
     b.attach_grad()

     np_out = onp.kron(a.asnumpy(), b.asnumpy())
     with mx.autograd.record():
         mx_out = test_kron(a, b)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
     mx_out.backward()

     # Test imperative once again
     mx_out = np.kron(a, b)
     np_out = onp.kron(a.asnumpy(), b.asnumpy())
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)

     # test numeric gradient
     a_sym = mx.sym.Variable("a").as_np_ndarray()
     b_sym = mx.sym.Variable("b").as_np_ndarray()
     mx_sym = mx.sym.np.kron(a_sym, b_sym).as_nd_ndarray()
     check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
                            rtol=1e-2, atol=1e-2, dtype=dtype)

     # test gradient via backward implemented by numpy
     np_backward = np_kron_backward(onp.ones(np_out.shape, dtype = dtype), a.asnumpy(), b.asnumpy())
     assert_almost_equal(a.grad.asnumpy(), np_backward[0], rtol=1e-2, atol=1e-2)
     assert_almost_equal(b.grad.asnumpy(), np_backward[1], rtol=1e-2, atol=1e-2)


 @use_np
 @pytest.mark.parametrize('shape', [rand_shape_nd(4, dim=4), (4, 0, 4, 0)])
 @pytest.mark.parametrize('axis', [0, 1, 2, 3, (), None])
 @pytest.mark.parametrize('keepdims', [True, False])
 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64', 'int8', 'int32', 'int64'])
 @pytest.mark.parametrize('itype,acc_type', [
     ('float16', 'float32'),
     ('float32', 'float64'),
     ('float64', 'float64'),
     ('int8', 'int32'),
     ('int32', 'int64'),
     ('int64', 'int64'),
     ('bool', 'int64')
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_sum(shape, axis, keepdims, itype, acc_type, dtype, hybridize):
     class TestSum(HybridBlock):
         def __init__(self, axis=None, dtype=None, keepdims=False):
             super(TestSum, self).__init__()
             self._axis = axis
             self._dtype = dtype
             self._keepdims = keepdims

         def forward(self, a, *args, **kwargs):
             return np.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims)

     class TestSumConv(HybridBlock):
         def __init__(self, axis=None, dtype=None, keepdims=False):
             super(TestSumConv, self).__init__()
             self._axis = axis
             self._dtype = dtype
             self._keepdims = keepdims

         def forward(self, a, *args, **kwargs):
             return a.sum(axis=self._axis, dtype=self._dtype, keepdims=self._keepdims)

     def is_int(dtype):
         return 'int' in dtype

     is_windows = sys.platform.startswith('win')
     if (is_int(dtype) and not is_int(itype)) or (is_windows and is_int(itype))\
             or (itype == 'bool' and\
                 (dtype not in ('float32', 'float64', 'int32', 'int64') or is_windows)):
         return
     # test gluon
     test_sum = TestSum(axis=axis, dtype=dtype, keepdims=keepdims)
     test_sum_conv = TestSumConv(axis=axis, dtype=dtype, keepdims=keepdims)
     if hybridize:
         test_sum.hybridize()
         test_sum_conv.hybridize()
     if is_int(itype):
         x = onp.random.randint(-128, 128, shape, dtype=itype)
         x = np.array(x)
     elif itype == 'bool':
         x = onp.random.randint(0, 2, shape) < 1
         x = np.array(x, dtype='bool')
     else:
         x = np.random.uniform(-1.0, 1.0, size=shape, dtype=itype)
     expected_ret = onp.sum(x.asnumpy(), axis=axis, dtype=acc_type, keepdims=keepdims)
     expected_ret = expected_ret.astype(dtype)
     if itype == 'bool':
         if is_op_runnable() and (not is_windows):  # special handling of boolean ndarray
             y = test_sum(x)
             y_conv = test_sum_conv(x)
             assert y.dtype == expected_ret.dtype
             assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-4, atol=1e-5,
                                 use_broadcast=False)
             assert y_conv.dtype == expected_ret.dtype
             assert_almost_equal(y_conv.asnumpy(), expected_ret, rtol=1e-4, atol=1e-5,
                                 use_broadcast=False)
         return

     x.attach_grad()
     with mx.autograd.record():
         y = test_sum(x)
         y_conv = test_sum_conv(x)
     assert y.shape == expected_ret.shape
     assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3,
                         atol=1e-5 if dtype == 'float16' else 1e-5, use_broadcast=False)
     assert y_conv.shape == expected_ret.shape
     assert_almost_equal(y_conv.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3,
                         atol=1e-5 if dtype == 'float16' else 1e-5, use_broadcast=False)
     y.backward()
     assert same(x.grad.asnumpy(), onp.ones(shape=x.shape, dtype=x.dtype))

     # test numeric
     if itype == 'float32' and dtype == 'float32' and shape != (4, 0, 4, 0):
         x_sym = mx.sym.Variable("x").as_np_ndarray()
         mx_sym = mx.sym.np.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray()
         check_numeric_gradient(mx_sym, [x.as_nd_ndarray()],
                                 numeric_eps=1e-3, rtol=1e-2, atol=1e-3, dtype=onp.float32)

     # test imperative
     mx_out = np.sum(x, axis=axis, dtype=dtype, keepdims=keepdims)
     np_out = onp.sum(x.asnumpy(), axis=axis, dtype=acc_type, keepdims=keepdims).astype(dtype)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('bool_agg', ['all', 'any'])
 @pytest.mark.parametrize('shape', [
     (), (5, ), (10, ), (2, 5), (5, 5), (10, 10),
     (4, 4, 4), (4, 6, 9), (6, 6, 6), (6, 0, 5),
     (7, 8, 9, 10), (7, 9, 11, 13), (0, 7, 7, 5)
 ])
 @pytest.mark.parametrize('axis', [True, False])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('keepdim', [True, False])
 @pytest.mark.parametrize('dtype', [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool])
 def test_np_bool_agg(bool_agg, shape, axis, keepdim, dtype, hybridize):
     class TestOp(HybridBlock):
         def __init__(self, axis=None, keepdims=False) :
             super(TestOp, self).__init__()
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, a):
             return getattr(np, bool_agg)(a, axis=self._axis, keepdims=self._keepdims)

     ndim = len(shape)
     samples = random.randint(0, ndim)
     axis = None if not axis else tuple(random.sample([i for i in range(0, ndim)], samples))
     x = np.random.normal(0, 5.0, size=shape).astype(dtype)
     test_op = TestOp(axis=axis, keepdims=keepdim)
     if hybridize:
         test_op.hybridize()
     y = test_op(x)
     expected_ret = getattr(onp, bool_agg)(x.asnumpy(), axis=axis, keepdims=keepdim)
     assert_almost_equal(y.asnumpy(), expected_ret)

     # test imperative
     mx_outs = getattr(np, bool_agg)(x, axis=axis, keepdims=keepdim)
     np_outs = getattr(onp, bool_agg)(x.asnumpy(), axis=axis, keepdims=keepdim)
     assert_almost_equal(mx_outs.asnumpy(), np_outs)


 @use_np
 @pytest.mark.parametrize('func', ['max', 'min'])
 @pytest.mark.parametrize('in_data_dim', [2, 3, 4])
 @pytest.mark.parametrize('itype', ['float16', 'float32', 'float64', 'int'])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('keepdims', [True, False])
 def test_np_max_min(func, in_data_dim, itype, keepdims, hybridize):
     class TestOp(HybridBlock):
         def __init__(self, axis=None, keepdims=False):
             super(TestOp, self).__init__()
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, a, *args, **kwargs):
             return getattr(a, func)(axis=self._axis, keepdims=self._keepdims)

     def is_int(dtype):
         return 'int' == dtype

     def get_grad(axis, func_name):
         index = -1 if func_name == 'max' else 0
         if axis == ():
             return onp.ones((2,3,4,5))
         else:
             temp = onp.zeros((2,3,4,5))
             if axis == 0:
                 temp[index,:,:,:] = 1
                 return temp
             elif axis == 1:
                 temp[:,index,:,:] = 1
                 return temp
             elif axis == 2:
                 temp[:,:,index,:] = 1
                 return temp
             elif (axis == 3 or axis == -1):
                 temp[:,:,:,index] = 1
                 return temp
             elif not axis:
                 temp[index,index,index,index] = 1
                 return temp
             raise ValueError('axis should be int or None or ()')

     shape = rand_shape_nd(in_data_dim, dim=3)
     for axis in ([i for i in range(in_data_dim)] + [(), None] + [-1]):
         test_gluon = TestOp(axis=axis, keepdims=keepdims)
         if hybridize:
             test_gluon.hybridize()
         if is_int(itype):
             x = np.arange(120).reshape((2, 3, 4, 5))
         else:
             x = np.random.uniform(-1.0, 1.0, size=shape, dtype=itype)
         x.attach_grad()
         ref_op = getattr(onp, 'a'+func)
         expected_ret = ref_op(x.asnumpy(), axis=axis, keepdims=keepdims)
         with mx.autograd.record():
             y = test_gluon(x)
         assert y.shape == expected_ret.shape
         assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if itype == 'float16' else 1e-3,
                             atol=1e-5 if itype == 'float16' else 1e-5)
         y.backward()
         # only check the gradient with hardcoded input
         if is_int(itype):
             assert same(x.grad.asnumpy(), get_grad(axis, func)), \
                 'x={}\ny={}\nx.grad={}\nnumpy={}'.format(x.asnumpy(), y.asnumpy(), x.grad.asnumpy(), get_grad(axis))

         # test imperative
         mx_out = getattr(np, func)(x, axis=axis, keepdims=keepdims)
         np_out = ref_op(x.asnumpy(), axis=axis, keepdims=keepdims)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

 @use_np
 @pytest.mark.parametrize('func', ['max', 'min'])
 @pytest.mark.parametrize('shape,exception', [
     ((), False),
     ((0), True),
     ((2, 0), True),
     ((0, 2, 1), True)
 ])
 def test_np_max_min_error(func, shape, exception):
     # test zero and zero dim
     def _test_np_exception(func, shape, dim):
         x = np.random.uniform(-1.0, 1.0, shape)
         out = getattr(x, func)()
         assert out.ndim == dim, 'dimension mismatch, output.ndim={}, dim={}'.format(output.ndim, dim)
     dim = 0
     if exception:
         assertRaises(MXNetError, _test_np_exception, func, shape, dim)
     else:
         _test_np_exception(func, shape, dim)


 @use_np
 @pytest.mark.parametrize('a_shape,w_shape,axes', [
     ((3, 5), (3, 5), None),
     ((4, 5, 6), (4, 5, 6), (0, 2)),
     ((3,), (3,), 0),
     ((2, 3), (3,), 1),
     ((2, 3, 4), (2,), 0),
     ((2, 3, 4), (3,), 1),
     ((2, 3, 4), (4,), -1),
     ((2, 3, 4, 5), (5,), 3)
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('is_weighted', [True, False])
 @pytest.mark.parametrize('returned', [True, False])
 @pytest.mark.parametrize('req_a', ['null', 'add', 'write'])
 @pytest.mark.flaky
 def test_np_average(a_shape, w_shape, axes, is_weighted, req_a,
                     hybridize, returned, dtype):
     class TestAverage(HybridBlock):
         def __init__(self, axis=None, returned=False):
             super(TestAverage, self).__init__()
             # necessary initializations
             self._axis = axis
             self._returned = returned

         def forward(self, a, weights):
             return np.average(a, weights=weights, axis=self._axis, returned=self._returned)

     def avg_backward(a, w, avg, axes, init_a_grad=None, init_w_grad=None):
         # avg = sum(a * w) / sum(w)
         if axes is not None and not isinstance(axes, tuple) and axes < 0:
             axes += a.ndim
         if w is None:
             a_grad = onp.ones(shape=a.shape, dtype=a.dtype)/(a.size/avg.size)
             if init_a_grad is not None:
                 a_grad += init_a_grad.asnumpy()
             return [a_grad, None]
         onedim = a.ndim != w.ndim
         if onedim:
             new_shape = [a.shape[i] if i == axes else 1 for i in range(a.ndim)]
             w = w.reshape(new_shape)
             w = onp.broadcast_to(w, a.shape)

         # partial a = w / sum(w)
         # partial w = (a*sum(w) - sum(a*w)) / (sum(w) * sum(w))
         scl = onp.sum(w, axis=axes, keepdims=True)
         a_grad = onp.divide(w, scl)
         w_grad = onp.divide(a*scl-onp.sum(a*w, axis=axes, keepdims=True), scl*scl)

         if onedim:
             axis = list(range(a.ndim))
             axis.remove(axes)
             w_grad = onp.sum(w_grad, axis=tuple(axis))
         if init_a_grad is not None:
             a_grad += init_a_grad.asnumpy()
         if init_w_grad is not None:
             w_grad += init_w_grad.asnumpy()
         return [a_grad, w_grad]

     if req_a == 'null' and not is_weighted:
         return
     rtol, atol = 1e-3, 1e-4
     test_average = TestAverage(axes, returned)
     if hybridize:
         test_average.hybridize()
     a = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype)
     a.attach_grad(req_a)
     init_a_grad = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype) if req_a == 'add' else None
     init_w_grad = None
     req_w = req_a
     w, np_w = None, None
     if is_weighted:
         w = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
         if req_a == 'null':
             req_w = random.choice(['add', 'write'])
         w.attach_grad(req_w)
         if req_w == 'add':
             init_w_grad = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
         np_w = w.asnumpy()
     np_out = onp.average(a.asnumpy(), axis=axes, weights=np_w, returned=returned)
     with mx.autograd.record():
         mx_out = test_average(a, w)
     if returned:
         np_out, np_sum_of_weights = np_out
         mx_out, mx_sum_of_weights = mx_out
         assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
     if req_a == 'add':
         a.grad[:] = init_a_grad
     if is_weighted and req_w == 'add':
         w.grad[:] = init_w_grad
     mx_out.backward()
     # Code to get reference backward value
     a_grad, w_grad = avg_backward(a.asnumpy(), np_w, np_out, axes, init_a_grad, init_w_grad)
     if is_weighted:
         assert_almost_equal(w.grad.asnumpy(), w_grad, rtol=rtol*10, atol=atol*10)
     if req_a == 'null':
         assert a.grad is None
     else:
         assert_almost_equal(a.grad.asnumpy(), a_grad, rtol=rtol, atol=atol)

     # Test imperative once again
     np_out = onp.average(a.asnumpy(), weights=np_w, axis=axes, returned=returned)
     mx_out = np.average(a, weights=w, axis=axes, returned=returned)
     if returned:
         np_out, np_sum_of_weights = np_out
         mx_out, mx_sum_of_weights = mx_out
         assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_mean():
     class TestMean(HybridBlock):
         def __init__(self, axis=None, dtype=None, keepdims=False):
             super(TestMean, self).__init__()
             self._axis = axis
             self._dtype = dtype
             self._keepdims = keepdims

         def forward(self, a, *args, **kwargs):
             return a.mean(axis=self._axis, dtype=self._dtype, keepdims=self._keepdims)

     def is_int(dtype):
         return 'int' in dtype

     is_windows = sys.platform.startswith('win')
     in_data_dim = random.choice([2, 3, 4])
     shape = rand_shape_nd(in_data_dim, dim=3)
     acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64',
                 'bool': 'int64', 'int8': 'int32', 'int32': 'int64', 'int64': 'int64'}
     ft_types = ['float16', 'float32', 'float64']
     it_types = ['bool', 'int8', 'int32', 'int64']
     for hybridize in [False, True]:
         for keepdims in [True, False]:
             for axis in ([i for i in range(in_data_dim)] + [(), None]):
                 for itype, dtype in itertools.product(ft_types, [None] + ft_types + it_types):
                     if dtype == 'bool':
                         continue
                     # test gluon
                     test_mean = TestMean(axis=axis, dtype=dtype, keepdims=keepdims)
                     if hybridize:
                         test_mean.hybridize()
                     x = np.random.uniform(-1.0, 1.0, size=shape).astype(itype)
                     x = x.as_np_ndarray()
                     x.attach_grad()

                     expected_ret = onp.mean(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims)
                     expected_ret = expected_ret.astype(dtype)
                     with mx.autograd.record():
                         y = test_mean(x)
                     assert y.shape == expected_ret.shape
                     assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3,
                                         atol=1e-5 if dtype == 'float16' else 1e-5)

                     y.backward()
                     N = x.size / y.size
                     assert same(x.grad.asnumpy(), onp.ones(shape=x.shape, dtype=x.dtype) / N)

                     # test numeric
                     if itype == 'float32' and dtype == 'float32':
                         x_sym = mx.sym.Variable("x").as_np_ndarray()
                         mx_sym = mx.sym.np.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray()
                         check_numeric_gradient(mx_sym, [x.as_nd_ndarray()],
                                                numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=onp.float32)

                     # test imperative
                     mx_out = np.mean(x, axis=axis, dtype=dtype, keepdims=keepdims)
                     np_out = onp.mean(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims).astype(dtype)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

                 for itype, dtype in itertools.product(it_types, [None] + ft_types + it_types):
                     if dtype == 'bool':
                         continue
                     # test gluon
                     test_mean = TestMean(axis=axis, dtype=dtype, keepdims=keepdims)
                     if hybridize:
                         test_mean.hybridize()

                     if itype == 'bool':
                         x = np.array(onp.random.uniform(size=shape) > 0.5)
                     else:
                         x = np.random.uniform(-128, 127, size=shape).astype(itype)

                     expected_ret = onp.mean(x.asnumpy(), axis=axis, dtype=dtype, keepdims=keepdims)

                     if itype == 'bool':
                         if is_op_runnable() and (not is_windows) and dtype not in ['float16', 'int8']:  # special handling of boolean ndarray
                             y = test_mean(x)
                             assert y.shape == expected_ret.shape
                             assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3,
                                                 atol=1e-5 if dtype == 'float16' else 1e-5)
                         continue

                     y = test_mean(x)
                     assert y.shape == expected_ret.shape
                     assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3,
                                         atol=1e-5 if dtype == 'float16' else 1e-5)

                     # test imperative
                     mx_out = np.mean(x, axis=axis, dtype=dtype, keepdims=keepdims)
                     np_out = onp.mean(x.asnumpy(), axis=axis, dtype=dtype, keepdims=keepdims).astype(dtype)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_moment():
     class TestMoment(HybridBlock):
         def __init__(self, name, axis=None, dtype=None, keepdims=False, ddof=0):
             super(TestMoment, self).__init__()
             self._moment_name = name
             self._axis = axis
             self._dtype = dtype
             self._keepdims = keepdims
             self._ddof = ddof

         def forward(self, a, *args, **kwargs):
             return getattr(a, self._moment_name)(axis=self._axis, dtype=self._dtype,
                                                  keepdims=self._keepdims, ddof=self._ddof)

     def is_int(dtype):
         return 'int' in dtype

     def legalize_shape(shape):
         shape_ = list(shape)
         for i in range(len(shape_)):
             shape_[i] += 1
         return tuple(shape_)

     in_data_dim = random.choice([2, 3, 4])
     shape = rand_shape_nd(in_data_dim, dim=3)
     shape = legalize_shape(shape)
     acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64',
                 'int8': 'float64', 'int32': 'float64', 'int64': 'float64'}

     for name in ['var', 'std']:
         for hybridize in [False, True]:
             for ddof in [0, 1]:
                 for keepdims in [True, False]:
                     for axis in ([i for i in range(in_data_dim)] + [(), None]):
                         for itype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']:
                             for dtype in ['float16', 'float32', 'float64']:
                                 if is_int(dtype) and not is_int(itype) or is_int(itype) and is_int(dtype):
                                     continue
                                 atol = 3e-4 if itype == 'float16' or dtype == 'float16' else 1e-5
                                 rtol = 1e-2 if itype == 'float16' or dtype == 'float16' else 1e-3
                                 # test gluon
                                 test_moment = TestMoment(name, axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof)
                                 if hybridize:
                                     test_moment.hybridize()
                                 if is_int(itype):
                                     x = onp.random.randint(-16, 16, shape, dtype=itype)
                                     x = mx.nd.array(x)
                                 else:
                                     x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype)
                                 x = x.as_np_ndarray()
                                 x.attach_grad()
                                 expected_ret = getattr(onp, name)(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims, ddof=ddof)
                                 expected_ret = expected_ret.astype(dtype)
                                 y = test_moment(x)
                                 assert y.shape == expected_ret.shape
                                 assert_almost_equal(y.asnumpy(), expected_ret, rtol=rtol, atol=atol, use_broadcast=False, equal_nan=True)

                                 # test imperative
                                 mx_out = getattr(np, name)(x, axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof)
                                 np_out = getattr(onp, name)(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims, ddof=ddof).astype(dtype)
                                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol, use_broadcast=False, equal_nan=True)


 @use_np
 def test_np_shape():
     shapes = [
         (),
         (0, 1),
         (2, 3),
         (2, 3, 4),
     ]

     for shape in shapes:
         mx_a = np.random.uniform(size=shape)
         np_a = onp.random.uniform(size=shape)

         mx_shape = np.shape(mx_a)
         np_shape = onp.shape(np_a)

         assert mx_shape == np_shape


 @use_np
 @pytest.mark.parametrize('config', [
     (0.0, 1.0, 10),
     (-2, 4, 30),
     (5.234324, 8.98324, 324),
     (2, 10, 100)
 ])
 @pytest.mark.parametrize('dtype', ['int32', 'float16', 'float32', 'float64', None])
 @pytest.mark.parametrize('endpoint', [True, False])
 @pytest.mark.parametrize('retstep', [True, False])
 def test_np_linspace(config, dtype, endpoint, retstep):
     if isinstance(config, tuple):
         mx_ret = np.linspace(*config, endpoint=endpoint, retstep=retstep, dtype=dtype)
         np_ret = onp.linspace(*config, endpoint=endpoint, retstep=retstep, dtype=dtype)
     else:
         mx_ret = np.linspace(config, endpoint=endpoint, retstep=retstep, dtype=dtype)
         np_ret = onp.linspace(config, endpoint=endpoint, retstep=retstep, dtype=dtype)
     if retstep:
         assert_almost_equal(mx_ret[0].asnumpy(), np_ret[0], atol=1e-3, rtol=1e-5)
         assert same(mx_ret[1], np_ret[1])
     else:
         assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-3, rtol=1e-5)

 @use_np
 @pytest.mark.parametrize('config', [
     (0.0, 1.0, 10),
     (-2, 4, 30),
     (5.234324, 8.98324, 324),
     (2, 10, 100)
 ])
 @pytest.mark.parametrize('dtype', ['int32', 'float16', 'float32', 'float64', None])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('endpoint', [True, False])
 def test_np_linspace_gluon(config, dtype, endpoint, hybridize):
     class TestLinspace(HybridBlock):
         def __init__(self, start, stop, num=50, endpoint=None, retstep=False, dtype=None, axis=0):
             super(TestLinspace, self).__init__()
             self._start = start
             self._stop = stop
             self._num = num
             self._endpoint = endpoint
             self._retstep = retstep
             self._dtype = dtype

         def forward(self, x):
             if self._retstep:
                 raise ValueError("linspace didn't support retstep = True inside HybridBlock")
             else:
                 return x + np.linspace(self._start, self._stop, num=self._num, \
                 endpoint=self._endpoint, retstep=self._retstep, dtype=self._dtype)

     x = np.zeros(shape=(), dtype=dtype)
     if isinstance(config, tuple):
         net = TestLinspace(*config, endpoint=endpoint, dtype=dtype)
         np_out = onp.linspace(*config, endpoint=endpoint, dtype=dtype)
     else:
         net = TestLinspace(config, endpoint=endpoint, dtype=dtype)
         np_out = onp.linspace(config, endpoint=endpoint, dtype=dtype)
     if hybridize:
         net.hybridize()
     mx_out = net(x)
     assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-3, rtol=1e-5)

 @use_np
 @pytest.mark.parametrize('config', [
     (0, 10, -1),
     (0, 1, 2.5)
 ])
 def test_np_linspace_error(config):
     with pytest.raises(MXNetError):
         np.linspace(*config)


 @use_np
 def test_np_linspace_arange():
     # check linspace equivalent to arange
     for test_index in range(1000):
         assert_almost_equal(mx.np.linspace(0, test_index, test_index + 1).asnumpy(), onp.arange(test_index + 1))


 @use_np
 @pytest.mark.parametrize('config', [
     (0.0, 1.0, 20),
     (2, 8, 0),
     (22, 11, 1),
     (2.22, 9.99, 11),
     (4.99999, 12.11111111, 111)
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64', None])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('endpoint', [True, False])
 @pytest.mark.parametrize('base', [0, 1, 5, 8, 10, 33])
 def test_np_logspace(config, dtype, endpoint, hybridize, base):
     class TestLogspace(HybridBlock):
         def __init__(self, start, stop, num=50, endpoint=None, base=50.0, dtype=None, axis=0):
             super(TestLogspace, self).__init__()
             self._start = start
             self._stop = stop
             self._num = num
             self._endpoint = endpoint
             self._base = base
             self._dtype = dtype
             self.axis = axis

         def forward(self, x):
             return x + np.logspace(self._start, self._stop, self._num, self._endpoint, self._base, self._dtype, self.axis)

     x = np.zeros(shape=(), dtype=dtype)
     net = TestLogspace(*config, endpoint=endpoint, base=base, dtype=dtype)
     np_out = onp.logspace(*config, endpoint=endpoint, base=base, dtype=dtype)
     if hybridize:
         net.hybridize()
     mx_out = net(x)
     assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-3, rtol=1e-5)
     if dtype is not None:
         assert mx_out.dtype == np_out.dtype

     # Test imperative once again
     mx_ret = np.logspace(*config, endpoint=endpoint, base=base, dtype=dtype)
     np_ret = onp.logspace(*config, endpoint=endpoint, base=base, dtype=dtype)
     assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-3, rtol=1e-5)
     if dtype is not None:
         assert mx_out.dtype == np_out.dtype


 @use_np
 @pytest.mark.parametrize('start,end,step', [
     ([], [], None),
     ([], [], []),
     ([1], [4], None),
     ([1], [10], [3]),
     ([10], [0], [-2]),
     ([None], [None], [None]),
     ([None], [None], [-1]),
     ([10], [None], [-1]),
     ([1, 0, 3], [-2, 10, -4], [None, 2, 3]),
     ([-2, -3, -5, -6], [1, 3, 4, 5], None),
     ([-2, -3, -5, -6], [1, 3, 4, 5], [-1, -2, -3, -4]),
     ([2, -3, -5, -6], [2, 3, 4, 5], None),
     ([2, -3, -5, 5], [3, 3, 4, 5], None),
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_npx_slice(start, end, step, hybridize):
     class TestSlice(HybridBlock):
         def __init__(self, begin, end, step):
             super(TestSlice, self).__init__()
             self._begin = begin
             self._end = end
             self._step = step

         def forward(self, a):
             return npx.slice(a, begin=self._begin, end=self._end, step=self._step)

     shape = (8, 16, 9, 9)
     np_array = onp.arange(onp.prod(shape), dtype='int32').reshape(shape)

     test_slice = TestSlice(begin=start, end=end, step=step)
     if hybridize:
         test_slice.hybridize()

     a = np.array(np_array, dtype=np_array.dtype)
     a.attach_grad()
     basic_index = tuple([
         slice(start[i], end[i], step[i]) if step is not None else slice(start[i], end[i])
         for i in range(len(start))
     ])
     expected_ret = np_array[basic_index]
     with mx.autograd.record():
         y = test_slice(a)

     assert same(y.asnumpy(), expected_ret)

     # test backward
     mx.autograd.backward(y)
     expected_grad = onp.zeros(shape)
     expected_grad[basic_index] = 1
     assert same(a.grad.asnumpy(), expected_grad)

 @use_np
 def test_npx_index_add():
     class TestIndexAdd(HybridBlock):
         def __init__(self):
             super(TestIndexAdd, self).__init__()

         def forward(self, a, ind, val):
             return npx.index_add(a, ind, val)

     def index_add_forward(a, ind, val, ind_ndim, ind_num):
         if val.dtype != a.dtype:
             val = val.astype(a.dtype)
         ind_arr = ind.transpose()
         if ind_arr.ndim == 0:
             ind_arr = onp.array([ind_arr])
         for i in range(ind_arr.shape[0]):
             t_ind = ind_arr[i]
             t_ind = tuple(t_ind.tolist()) if type(t_ind) is onp.ndarray else t_ind.tolist()
             if val.ndim + ind_ndim > a.ndim:
                 t_val = val[tuple([0 if val.shape[0]==1 else i])]
                 if type(t_val) is onp.ndarray and t_val.shape[0] == 1:
                     a[t_ind] += onp.squeeze(t_val, axis=0)
                 else:
                     a[t_ind] += t_val
             else:
                 a[t_ind] += val
         return a

     def index_add_bwd(out_grad, a_grad, ind, val_grad, ind_ndim, ind_num, grad_req_a, grad_req_val):
         if grad_req_a == 'add':
             init_a_grad = onp.array(a_grad)
         if grad_req_val == 'add':
             init_val_grad = onp.array(val_grad)
         a_grad = onp.zeros(a_grad.shape) + out_grad
         a_grad = a_grad.astype(a_grad.dtype)
         val_grad = onp.zeros(val_grad.shape).astype(val_grad.dtype)

         ind_arr = ind.transpose()
         if ind_arr.ndim == 0:
             ind_arr = onp.array([ind_arr])
         for i in range(ind_arr.shape[0]):
             t_ind = ind_arr[i]
             t_ind = tuple(ind_arr[i].tolist()) if type(ind_arr[i]) is onp.ndarray else ind_arr[i].tolist()
             if val_grad.ndim + ind_ndim > a_grad.ndim:
                 idx = 0 if val_grad.shape[0]==1 else i
                 t_grad = out_grad[t_ind]
                 t_grad_shape = onp.array(t_grad.shape)
                 val_grad_shape = onp.array(val_grad[idx].shape)
                 if type(val_grad[idx]) is not onp.ndarray:
                     t_grad = onp.sum(t_grad)
                 else:
                     is_not_equal = t_grad_shape - val_grad_shape
                     if onp.any(is_not_equal):
                         broadcast_dim = onp.nonzero(onp.where(is_not_equal, 1, 0))
                         t_grad = onp.sum(t_grad, axis=tuple(broadcast_dim[0].reshape(1, -1)[0]), keepdims=True)
                 val_grad[idx] += t_grad
             else:
                 t_grad = out_grad[t_ind]
                 if type(val_grad) is not onp.ndarray or val_grad.shape == ():
                     t_grad = onp.sum(t_grad)
                 else:
                     if type(t_grad) is onp.ndarray:
                         ext_dim = t_grad.ndim() - val_grad.ndim()
                         if ext_dim:
                             t_grad = onp.sum(t_grad, axis=tuple(onp.arange(ext_dim)))
                         t_grad_shape = onp.array(t_grad.shape)
                         val_grad_shape = onp.array(val_grad.shape)
                         is_not_equal = t_grad_shape - val_grad_shape
                         if onp.any(is_not_equal):
                             broadcast_dim = onp.nonzero(onp.where(is_not_equal, 1, 0))
                             t_grad = onp.sum(t_grad, axis=tuple(broadcast_dim.reshape(1, -1)[0]), keepdims=True)
                 val_grad += t_grad
         if grad_req_a == 'add':
             a_grad += init_a_grad
         if grad_req_val == 'add':
             val_grad += init_val_grad
         return a_grad, val_grad

     # a.shape, ind.shape, val.shape, ind_ndim, ind_num
     configs = [((2, ), np.array(1, dtype=onp.int32), (1, ), 1, 1)]
     shape = tuple(onp.random.randint(1, 6, size=(4))) # a.shape
     for ind_ndim in range(1, 5): # ind.shape: (ind_ndim, ind_num)
         ind_num = onp.random.randint(1, 7)
         ind = []
         for ind_dim in range(ind_ndim):
             ind.append(onp.random.randint(0, shape[ind_dim], size=(ind_num)))
         ind = onp.array(ind).astype(onp.int32)
         # case: val is scalar
         configs.append(tuple([shape, ind, (), ind_ndim, ind_num]))
         for _ in range(1, 5 - ind_ndim):
             val_shape = [1 if onp.random.randint(0, 5)==0 else ind_num]
             for val_dim in range(ind_ndim, 4):
                 val_shape.append(1 if onp.random.randint(0, 5)==0 else shape[val_dim])
             # case: val is tensor
             configs.append(tuple([shape, ind, tuple(val_shape), ind_ndim, ind_num]))

     dtypes = ['float32', 'float64', 'int32', 'int64']
     grad_req = ['write', 'null', 'add']
     for hybridize, grad_req_a, grad_req_val, dtype, indtype in \
         itertools.product([True, False], grad_req, grad_req, dtypes, ['int32', 'int64']):
         for a_shape, ind, val_shape ,ind_ndim, ind_num in configs:
             eps = 1e-3
             atype = dtype
             valtype = dtype
             test_index_add = TestIndexAdd()
             if hybridize:
                 test_index_add.hybridize()
             a = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
             a.attach_grad(grad_req=grad_req_a)
             val = mx.nd.random.uniform(-10.0, 10.0, shape=val_shape).as_np_ndarray().astype(valtype)
             val.attach_grad(grad_req=grad_req_val)
             expected_ret = index_add_forward(a.asnumpy(), ind.astype(indtype), val.asnumpy(), ind_ndim, ind_num)
             with mx.autograd.record():
                 mx_ret = test_index_add(a, np.array(ind).astype(indtype), val)
             assert mx_ret.shape == a.shape
             assert expected_ret.shape == a.shape
             assert mx_ret.dtype == a.dtype
             assert expected_ret.dtype == a.dtype
             assert_almost_equal(mx_ret.asnumpy(), expected_ret, rtol=eps, atol=eps)

             if atype not in ['float16', 'float32', 'float64'] or valtype not in ['float16', 'float32', 'float64']:
                 continue
             if grad_req_a != 'null' or grad_req_val != 'null':
                 init_a_grad = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
                 init_val_grad = mx.nd.random.uniform(-10.0, 10.0, shape=val_shape).as_np_ndarray().astype(valtype)
                 out_grad = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
                 if grad_req_a == 'add':
                     if init_a_grad.ndim == 0:
                         a.grad[()] = init_a_grad.item()
                     else:
                         a.grad[:] = init_a_grad
                 if grad_req_val == 'add':
                     if init_val_grad.ndim == 0:
                         val.grad[()] = init_val_grad.item()
                     else:
                         val.grad[:] = init_val_grad
                 mx_ret.backward(out_grad)
                 expected_bwd_a, expected_bwd_val = index_add_bwd(out_grad.asnumpy(), init_a_grad.asnumpy(), ind,
                                                                  init_val_grad.asnumpy(), ind_ndim, ind_num,
                                                                  grad_req_a, grad_req_val)
                 if grad_req_a == 'null':
                     assert a.grad is None
                 else:
                     assert_almost_equal(a.grad.asnumpy(), expected_bwd_a, rtol = eps, atol=eps)
                 if grad_req_val == 'null':
                     assert val.grad is None
                 else:
                     assert_almost_equal(val.grad.asnumpy(), expected_bwd_val, rtol = eps, atol=eps)

             mx_out = npx.index_add(a, np.array(ind).astype(indtype), val)
             assert_almost_equal(mx_out.asnumpy(), expected_ret, rtol=eps, atol=eps)


 @use_np
 def test_npx_index_update():
     class TestIndexUpdate(HybridBlock):
         def __init__(self):
             super(TestIndexUpdate, self).__init__()

         def forward(self, a, ind, val):
             return npx.index_update(a, ind, val)

     def check_index_update_forward(mx_ret, a, ind, val, ind_ndim, ind_num, eps):
         if val.dtype != a.dtype:
             val = val.astype(a.dtype)
         ind_arr = ind.transpose()
         if ind_arr.ndim == 0:
             ind_arr = onp.array([ind_arr])
         for i in range(ind_arr.shape[0]):
             t_ind = ind_arr[i]
             t_ind = tuple(t_ind.tolist()) if type(t_ind) is onp.ndarray else t_ind.tolist()
             if val.ndim + ind_ndim > a.ndim:
                 t_val = val[tuple([0 if val.shape[0]==1 else i])]
                 if type(t_val) is onp.ndarray and t_val.shape[0] == 1:
                     expect_tmp = onp.squeeze(t_val, axis=0)
                 else:
                     expect_tmp = t_val
             else:
                 expect_tmp = val
             mx_tmp = mx_ret[t_ind]
             close_pos = onp.where(onp.isclose(expect_tmp, mx_tmp, rtol=eps, atol=eps))
             if a[t_ind].ndim == 0:
                 if close_pos[0].size == 1:
                     mx_ret[t_ind] = 0
                     a[t_ind] = 0
             else:
                 mx_ret[t_ind][close_pos] = 0
                 a[t_ind][close_pos] = 0
         assert_almost_equal(mx_ret, a, rtol=eps, atol=eps)

     def index_update_bwd(out_grad, a_grad, ind, val_grad, ind_ndim, ind_num, grad_req_a, grad_req_val):
         if grad_req_a == 'add':
             init_a_grad = onp.array(a_grad)
         if grad_req_val == 'add':
             init_val_grad = onp.array(val_grad)
         a_grad = onp.zeros(a_grad.shape) + out_grad
         a_grad = a_grad.astype(a_grad.dtype)
         val_grad = onp.zeros(val_grad.shape).astype(val_grad.dtype)

         ind_arr = ind.transpose()
         if ind_arr.ndim == 0:
             ind_arr = onp.array([ind_arr])
         for i in range(ind_arr.shape[0]):
             t_ind = ind_arr[i]
             t_ind = tuple(ind_arr[i].tolist()) if type(ind_arr[i]) is onp.ndarray else ind_arr[i].tolist()
             a_grad[t_ind] = 0
             if val_grad.ndim + ind_ndim > a_grad.ndim:
                 idx = 0 if val_grad.shape[0]==1 else i
                 t_grad = out_grad[t_ind]
                 t_grad_shape = onp.array(t_grad.shape)
                 val_grad_shape = onp.array(val_grad[idx].shape)
                 if type(val_grad[idx]) is not onp.ndarray:
                     t_grad = onp.sum(t_grad)
                 else:
                     is_not_equal = t_grad_shape - val_grad_shape
                     if onp.any(is_not_equal):
                         broadcast_dim = onp.nonzero(onp.where(is_not_equal, 1, 0))
                         t_grad = onp.sum(t_grad, axis=tuple(broadcast_dim[0].reshape(1, -1)[0]), keepdims=True)
                 val_grad[idx] += t_grad
             else:
                 t_grad = out_grad[t_ind]
                 if type(val_grad) is not onp.ndarray or val_grad.shape == ():
                     t_grad = onp.sum(t_grad)
                 else:
                     if type(t_grad) is onp.ndarray:
                         ext_dim = t_grad.ndim() - val_grad.ndim()
                         if ext_dim:
                             t_grad = onp.sum(t_grad, axis=tuple(onp.arange(ext_dim)))
                         t_grad_shape = onp.array(t_grad.shape)
                         val_grad_shape = onp.array(val_grad.shape)
                         is_not_equal = t_grad_shape - val_grad_shape
                         if onp.any(is_not_equal):
                             broadcast_dim = onp.nonzero(onp.where(is_not_equal, 1, 0))
                             t_grad = onp.sum(t_grad, axis=tuple(broadcast_dim.reshape(1, -1)[0]), keepdims=True)
                 val_grad += t_grad
         if grad_req_a == 'add':
             a_grad += init_a_grad
         if grad_req_val == 'add':
             val_grad += init_val_grad
         return a_grad, val_grad

     # a.shape, ind.shape, val.shape, ind_ndim, ind_num
     configs = [((2, ), np.array(1, dtype=onp.int32), (1, ), 1, 1)]
     shape = tuple(onp.random.randint(1, 6, size=(4))) # a.shape
     for ind_ndim in range(1, 5): # ind.shape: (ind_ndim, ind_num)
         ind_num = onp.random.randint(1, 7)
         ind = []
         for ind_dim in range(ind_ndim):
             ind.append(onp.random.randint(0, shape[ind_dim], size=(ind_num)))
         ind = onp.array(ind).astype(onp.int32)
         # case: val is scalar
         configs.append(tuple([shape, ind, (), ind_ndim, ind_num]))
         for _ in range(1, 5 - ind_ndim):
             val_shape = [1 if onp.random.randint(0, 5)==0 else ind_num]
             for val_dim in range(ind_ndim, 4):
                 val_shape.append(1 if onp.random.randint(0, 5)==0 else shape[val_dim])
             # case: val is tensor
             configs.append(tuple([shape, ind, tuple(val_shape), ind_ndim, ind_num]))

     dtypes = ['float32', 'float64', 'int32', 'int64']
     grad_req = ['write', 'null', 'add']
     for hybridize, grad_req_a, grad_req_val, dtype, indtype in \
         itertools.product([True, False], grad_req, grad_req, dtypes, ['int32', 'int64']):
         for a_shape, ind, val_shape ,ind_ndim, ind_num in configs:
             eps = 1e-3
             atype = dtype
             valtype = dtype
             test_index_update = TestIndexUpdate()
             if hybridize:
                 test_index_update.hybridize()
             a = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
             a.attach_grad(grad_req=grad_req_a)
             val = mx.nd.random.uniform(-10.0, 10.0, shape=val_shape).as_np_ndarray().astype(valtype)
             val.attach_grad(grad_req=grad_req_val)
             with mx.autograd.record():
                 mx_ret = test_index_update(a, np.array(ind).astype(indtype), val)
             assert mx_ret.shape == a.shape
             assert mx_ret.dtype == a.dtype
             check_index_update_forward(mx_ret.asnumpy(), a.asnumpy(), ind.astype(indtype), val.asnumpy(), ind_ndim, ind_num, eps)

             if atype not in ['float16', 'float32', 'float64'] or valtype not in ['float16', 'float32', 'float64']:
                 continue
             if grad_req_a != 'null' or grad_req_val != 'null':
                 init_a_grad = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
                 init_val_grad = mx.nd.random.uniform(-10.0, 10.0, shape=val_shape).as_np_ndarray().astype(valtype)
                 out_grad = mx.nd.random.uniform(-10.0, 10.0, shape=a_shape).as_np_ndarray().astype(atype)
                 if grad_req_a == 'add':
                     if init_a_grad.ndim == 0:
                         a.grad[()] = init_a_grad.item()
                     else:
                         a.grad[:] = init_a_grad
                 if grad_req_val == 'add':
                     if init_val_grad.ndim == 0:
                         val.grad[()] = init_val_grad.item()
                     else:
                         val.grad[:] = init_val_grad
                 mx_ret.backward(out_grad)
                 expected_bwd_a, expected_bwd_val = index_update_bwd(out_grad.asnumpy(), init_a_grad.asnumpy(), ind,
                                                                     init_val_grad.asnumpy(), ind_ndim, ind_num,
                                                                     grad_req_a, grad_req_val)

                 if grad_req_a == 'null':
                     assert a.grad is None
                 else:
                     assert_almost_equal(a.grad.asnumpy(), expected_bwd_a, rtol = eps, atol=eps)
                 if grad_req_val == 'null':
                     assert val.grad is None
                 else:
                     assert_almost_equal(val.grad.asnumpy(), expected_bwd_val, rtol = eps, atol=eps)

             mx_out = npx.index_update(a, np.array(ind).astype(indtype), val)
             check_index_update_forward(mx_out.asnumpy(), a.asnumpy(), ind.astype(indtype), val.asnumpy(), ind_ndim, ind_num, eps)


 @use_np
 def test_npx_batch_dot():
     device = mx.device.current_device()
     dtypes = ['float32', 'float64']
     if device.device_type == 'gpu':
         dtypes += ['float16']
     eps_dict = {'float32': 1E-4, 'float64': 1E-4, 'float16': 1E-3}
     class TestBatchDot(HybridBlock):
         def __init__(self, transpose_a, transpose_b):
             super(TestBatchDot, self).__init__()
             self._transpose_a = transpose_a
             self._transpose_b = transpose_b

         def forward(self, lhs, rhs):
             return npx.batch_dot(lhs, rhs,
                                    transpose_a=self._transpose_a,
                                    transpose_b=self._transpose_b)

     def batch_dot_numpy(lhs, rhs, transpose_a, transpose_b):
         assert lhs.ndim == rhs.ndim >= 3
         if transpose_a:
             lhs = lhs.swapaxes(-1, -2)
         if transpose_b:
             rhs = rhs.swapaxes(-1, -2)
         return onp.matmul(lhs, rhs)

     def gt_grad_batch_dot_numpy(lhs, rhs, ograd, transpose_a, transpose_b, lhs_req, rhs_req,
                                 init_lhs_grad, init_rhs_grad):

         if transpose_a and transpose_b:
             # Gradient of z = dot(x.T, y.T)
             # dx = dot(dz, y).T = dot(y.T, dz.T)
             # dy = dot(x, dz).T = dot(dz.T, x.T)
             lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=True, transpose_b=True)
             rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=True)
         elif not transpose_a and transpose_b:
             # Gradient of z = dot(x, y.T)
             # dx = dot(dz, y)
             # dy = dot(x.T, dz).T = dot(dz.T, x)
             lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=False)
             rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=False)
         elif transpose_a and not transpose_b:
             # Gradient of z = dot(x.T, y)
             # dx = dot(dz, y.T).T = dot(y, dz.T)
             # dy = dot(x, dz)
             lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=False, transpose_b=True)
             rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=False, transpose_b=False)
         else:
             # Gradient of z = dot(x, y)
             # dx = dot(dz, y.T)
             # dy = dot(x.T, dz)
             lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=True)
             rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=True, transpose_b=False)
         if lhs_req == 'add':
             lhs_grad += init_lhs_grad
         if rhs_req == 'add':
             rhs_grad += init_rhs_grad
         return lhs_grad, rhs_grad


     configs = [
         ((2, 3, 0), (2, 4, 0), False, True),
         ((2, 4, 3), (2, 4, 3), True, False),
         ((0, 3, 0), (0, 0, 2), False, False),
         ((3, 2, 3, 2), (3, 2, 2, 3), True, True),
         ((3, 1, 5, 2), (3, 1, 2, 1), False, False)
     ]
     bad_configs = [
         ((5, 3, 2), (5, 1, 3), False, False),
         ((2, 5, 3, 1), (2, 4, 3, 1), True, False)
     ]
     for hybridize in [True, False]:
         for lhs_shape, rhs_shape, transpose_a, transpose_b in configs:
             for dtype in dtypes:
                 eps = eps_dict[dtype]
                 for lhs_grad_req in ['write', 'add']:
                     for rhs_grad_req in ['write', 'add']:
                         f_batch_dot = TestBatchDot(transpose_a=transpose_a,
                                                    transpose_b=transpose_b)
                         if hybridize:
                             f_batch_dot.hybridize()
                         lhs_val = mx.np.array(onp.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype)
                         rhs_val = mx.np.array(onp.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype)
                         lhs_val.attach_grad(grad_req=lhs_grad_req)
                         rhs_val.attach_grad(grad_req=rhs_grad_req)
                         gt_out = batch_dot_numpy(lhs_val.asnumpy(), rhs_val.asnumpy(),
                                                  transpose_a, transpose_b)
                         init_lhs_grad = mx.np.random.uniform(-1.0, 1.0, lhs_shape, dtype=dtype)
                         init_rhs_grad = mx.np.random.uniform(-1.0, 1.0, rhs_shape, dtype=dtype)
                         o_grad = mx.np.random.uniform(-1.0, 1.0, gt_out.shape, dtype=dtype)
                         if lhs_grad_req == 'add':
                             lhs_val.grad[:] = init_lhs_grad
                         if rhs_grad_req == 'add':
                             rhs_val.grad[:] = init_rhs_grad
                         with mx.autograd.record():
                             out = f_batch_dot(lhs_val, rhs_val)
                         out.backward(o_grad)
                         assert_almost_equal(out.asnumpy(), gt_out, rtol=eps, atol=eps)
                         gt_lhs_grad, gt_rhs_grad = gt_grad_batch_dot_numpy(lhs_val.asnumpy(),
                                                               rhs_val.asnumpy(),
                                                               o_grad.asnumpy(),
                                                               transpose_a=transpose_a,
                                                               transpose_b=transpose_b,
                                                               lhs_req=lhs_grad_req,
                                                               rhs_req=rhs_grad_req,
                                                               init_lhs_grad=init_lhs_grad.asnumpy(),
                                                               init_rhs_grad=init_rhs_grad.asnumpy())
                         assert_almost_equal(lhs_val.grad.asnumpy(), gt_lhs_grad, rtol=eps, atol=eps)
                         assert_almost_equal(rhs_val.grad.asnumpy(), gt_rhs_grad, rtol=eps, atol=eps)
     for lhs_shape, rhs_shape, transpose_a, transpose_b in bad_configs:
         for dtype in dtypes:
             lhs_val = mx.np.array(onp.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype)
             rhs_val = mx.np.array(onp.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype)
             pytest.raises(MXNetError, lambda: mx.npx.batch_dot(lhs_val, rhs_val,
                                                                transpose_a=transpose_a,
                                                                transpose_b=transpose_b))


 @use_np
 @pytest.mark.parametrize('shape', [(4, 2), (4, 3, 4),
     (4, 6, 4, 5), (4, 5, 6, 4, 5)])
 @pytest.mark.parametrize('fix_gamma', [False, True])
 @pytest.mark.parametrize('cudnn_off', [False, True])
 @pytest.mark.parametrize('output_mean_var', [False, True])
 @pytest.mark.flaky
 def test_npx_batch_norm(shape, fix_gamma, cudnn_off, output_mean_var):
     momentum = 0.9
     epsilon = 1e-5
     class TestBatchNorm(HybridBlock):
         def __init__(self, eps=1e-5, fix_gamma=False, momentum=0.9, **kwargs):
             super().__init__()
             self.eps = eps
             self.fix_gamma = fix_gamma
             self.momentum = momentum
             self.kwargs = kwargs
         def forward(self, data, bn_gamma, bn_beta,
                            bn_running_mean, bn_running_var):
             op = npx.batch_norm
             output = op(data, bn_gamma, bn_beta,
                         bn_running_mean, bn_running_var,
                         momentum=self.momentum, eps=self.eps,
                         fix_gamma=self.fix_gamma, **self.kwargs)
             return output

     def _test_batchnorm_impl(axis,
                              data_grad_req, gamma_grad_req, beta_grad_req):
         kwargs = dict(output_mean_var=output_mean_var)
         kwargs.update(dict(axis=axis, cudnn_off=cudnn_off))
         op = TestBatchNorm(eps=epsilon, fix_gamma=fix_gamma, momentum=momentum, **kwargs)
         nch = shape[axis]

         if not fix_gamma:
             bn_gamma = np.random.uniform(size=(nch,))
             bn_gamma.attach_grad(grad_req=gamma_grad_req)
         else:
             bn_gamma = np.ones((nch,))

         bn_beta = np.random.uniform(size=(nch,))
         bn_beta.attach_grad(grad_req=beta_grad_req)

         bn_running_mean = np.zeros(nch)
         bn_running_var = np.ones(nch)

         running_mean = np.zeros(nch)
         running_var = np.ones(nch)
         num_iters = 10
         expand_shape = [1] * len(shape)
         expand_shape[axis] = shape[axis]
         expand_shape = tuple(expand_shape)
         data = np.random.uniform(size=shape)
         data.attach_grad(grad_req=data_grad_req)
         adX, adW, adb = 0, 0, 0
         is_train = data_grad_req != 'null' or \
             (not fix_gamma and gamma_grad_req != 'null') or \
             beta_grad_req != 'null'
         for _ in range(num_iters):
             if data_grad_req != 'add':
                 data = np.random.uniform(size=shape)
                 data.attach_grad(grad_req=data_grad_req)
             ograd = np.random.uniform(size=shape)
             with mx.autograd.record():
                 output = op(data, bn_gamma, bn_beta,
                             bn_running_mean, bn_running_var)
                 if output_mean_var:
                     output, output_mean, output_std = output
                 if is_train:
                     output.backward(ograd)
             mx.nd.waitall()

             assert 0 <= axis < data.ndim
             reduce_axis = tuple(i for i in range(data.ndim) if i != axis)
             assert len(reduce_axis) == data.ndim - 1
             data_mean = data.mean(
                 axis=reduce_axis, keepdims=True)
             data_var = ((data - data_mean) ** 2).mean(axis=reduce_axis,
                                                         keepdims=True)

             target_output = (data - data_mean) / \
                 np.sqrt(data_var + epsilon) * \
                 bn_gamma.reshape(expand_shape) + \
                 bn_beta.reshape(expand_shape)

             # squeeze data_mean and data_var
             data_mean_flat = data_mean.squeeze()
             data_var_flat = data_var.squeeze()

             running_mean = running_mean * momentum + \
                 data_mean_flat * (1 - momentum)

             m = onp.prod(shape) / shape[axis]
             # cudnn uses m-1 in the denominator of its sample variance calculation, not m
             sample_var_adjust = 1.0 if cudnn_off or fix_gamma else m / (m-1)
             running_var = running_var * momentum + \
                 data_var_flat * sample_var_adjust * (1 - momentum)

             W = bn_gamma.reshape(expand_shape)
             dnx = ograd * W
             xsm = data - data_mean
             nd = 1.0 / np.sqrt(data_var + epsilon)
             nx = xsm * nd
             dvar = (dnx * xsm).sum(axis=reduce_axis, keepdims=True,
                                   ) * (-0.5) * np.power(nd, 3)
             dmean = -nd * dnx.sum(axis=reduce_axis, keepdims=True) - \
                 dvar * xsm.mean(axis=reduce_axis, keepdims=True,
                                 ) * 2.0
             dX = dnx * nd + dvar * xsm * (2.0 / m) + dmean * (1.0 / m)
             dW = (ograd * nx).sum(axis=reduce_axis)
             db = ograd.sum(axis=reduce_axis)
             adX = dX if data_grad_req != 'add' else adX + dX
             adW = dW if gamma_grad_req != 'add' else adW + dW
             adb = db if beta_grad_req != 'add' else adb + db

             atol, rtol = 5e-2, 5e-2

             if output_mean_var:
                 assert_almost_equal(output_mean.asnumpy(),
                                     data_mean_flat.asnumpy(),
                                     atol=atol, rtol=rtol)
                 assert_almost_equal(output_std.asnumpy(),
                                     (1.0 / np.sqrt(data_var_flat +
                                             epsilon)).asnumpy(),
                                     atol=atol, rtol=rtol)
             assert_almost_equal(output.asnumpy(), target_output.asnumpy(),
                                 atol=atol, rtol=rtol)
             if is_train:
                 assert_almost_equal(bn_running_mean.asnumpy(
                 ), running_mean.asnumpy(), atol=atol, rtol=rtol)
                 assert_almost_equal(bn_running_var.asnumpy(
                 ), running_var.asnumpy(), atol=atol, rtol=rtol)

             if data_grad_req != 'null':
                 assert_almost_equal(data.grad.asnumpy(),
                                     adX.asnumpy(), atol=atol, rtol=rtol)
             if not fix_gamma:
                 if gamma_grad_req != 'null':
                     assert_almost_equal(
                         bn_gamma.grad.asnumpy(), adW.asnumpy(),
                         atol=atol, rtol=rtol)
             else:
                 assert((bn_gamma.asnumpy() == 1).all())
             if beta_grad_req != 'null':
                 assert_almost_equal(
                     bn_beta.grad.asnumpy(), adb.asnumpy(), atol=atol, rtol=rtol)

     grad_reqs = ['write'] if len(shape) != 4 else ['null', 'write', 'add']
     for data_grad_req in grad_reqs:
         for gamma_grad_req in grad_reqs:
             if fix_gamma and gamma_grad_req != 'null':
                 continue
             for beta_grad_req in grad_reqs:
                 for axis in range(len(shape)):
                     _test_batchnorm_impl(axis,
                         data_grad_req, gamma_grad_req, beta_grad_req)


 def np_softmax(x, axis=-1):
     if (x.shape[axis] == 0):
         return onp.sum(x, axis=axis, keepdims=True)
     x = x - onp.max(x, axis=axis, keepdims=True)
     x = onp.exp(x)
     x /= onp.sum(x, axis=axis, keepdims=True)
     return x

 def np_log_softmax(x, axis=-1):
     return onp.log(np_softmax(x, axis))

 @use_np
 def test_npx_softmax():
     class TestSoftmax(HybridBlock):
         def __init__(self, axis):
             super(TestSoftmax, self).__init__()
             self._axis = axis

         def forward(self, a):
             return npx.softmax(a, axis=axis)

     class TestLogSoftmax(HybridBlock):
         def __init__(self, axis):
             super(TestLogSoftmax, self).__init__()
             self._axis = axis

         def forward(self, a):
             return npx.log_softmax(a, axis=axis)


     #(operator, function) tuples
     tested_ops = [(TestSoftmax, np_softmax),
                   (TestLogSoftmax, np_log_softmax)]

     # only testing 0-size shaped inputs here, other input cases have been tested in test_opeartor.py
     for SoftmaxOp, softmax_function in tested_ops:
         for hybridize in [True, False]:
             for shape in [(3, 0, 4), (0, 0)]:
                 mx_a = np.random.uniform(size=shape)
                 mx_a.attach_grad()
                 for axis in range(-len(shape), len(shape)):
                     test_softmax_op = SoftmaxOp(axis)
                     if hybridize:
                         test_softmax_op.hybridize()

                     with mx.autograd.record():
                         mx_out = test_softmax_op(mx_a)

                     mx_out.wait_to_read()

                     np_out = softmax_function(mx_a.asnumpy(), axis)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, equal_nan=True)

                     mx_out.backward()
                     mx_a.grad.wait_to_read()
                     assert_almost_equal(mx_a.grad.asnumpy(), onp.zeros(shape), rtol=1e-3, atol=1e-5)


 def np_masked_softmax(data, mask, axis=-1, temperature=1.0):
     neg = -1e18
     if data.dtype == onp.float16:
         neg = -1e4
     temp = onp.where(mask, data, neg)
     result = (np_softmax(temp, axis=axis) / temperature) * mask
     return result

 def np_masked_log_softmax(data, mask, axis=-1, temperature=1.0):
     neg = -1e18
     if data.dtype == onp.float16:
         neg = -1e4
     data = onp.where(mask, data, neg)
     return onp.where(mask, np_log_softmax(data, axis=axis) / temperature, -onp.inf)

 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('shape', [(3, 0, 4), (0, 0)])
 def test_npx_masked_softmax(hybridize, shape):
     class TestMaskedSoftmax(HybridBlock):
         def __init__(self, axis):
             super(TestMaskedSoftmax, self).__init__()
             self._axis = axis

         def forward(self, a, mask):
             return npx.masked_softmax(a, mask, axis=self._axis)

     class TestMaskedLogSoftmax(HybridBlock):
         def __init__(self, axis):
             super(TestMaskedLogSoftmax, self).__init__()
             self._axis = axis

         def forward(self, a, mask):
             return npx.masked_log_softmax(a, mask, axis=self._axis)

     #(operator, function) tuples
     tested_ops = [(TestMaskedSoftmax, np_masked_softmax),
                   (TestMaskedLogSoftmax, np_masked_log_softmax)]

     # only testing 0-size shaped inputs here, other input cases have been tested in test_opeartor.py
     for SoftmaxOp, softmax_function in tested_ops:
         mx_a = np.random.uniform(size=shape)
         mask = np.random.randint(0, 2, shape)
         mx_a.attach_grad()
         mask.attach_grad()
         for axis in range(-len(shape), len(shape)):
             test_softmax_op = SoftmaxOp(axis)
             if hybridize:
                 test_softmax_op.hybridize()

             with mx.autograd.record():
                 mx_out = test_softmax_op(mx_a, mask)

             mx_out.wait_to_read()

             np_out = softmax_function(mx_a.asnumpy(), mask.asnumpy(), axis)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, equal_nan=True)


 @use_np
 def test_npi_boolean_assign():
     class TestBooleanAssignScalar(HybridBlock):
         def __init__(self, val, start_axis):
             super(TestBooleanAssignScalar, self).__init__()
             self._val = val
             self._start_axis = start_axis

         def forward(self, a, mask):
             return _npi.boolean_mask_assign_scalar(a, mask, self._val, start_axis=self._start_axis, out=a)

     class TestBooleanAssignTensor(HybridBlock):
         def __init__(self, start_axis):
             super(TestBooleanAssignTensor, self).__init__()
             self._start_axis = start_axis

         def forward(self, a, mask, value):
             return _npi.boolean_mask_assign_tensor(a, mask, value, start_axis=self._start_axis, out=a)

     configs = [
         ((3, 4), (3, 4), 0),
         ((3, 0), (3, 0), 0),
         ((), (), 0),
         ((2, 3, 4, 5), (2, 3), 0),
         ((2, 3, 4, 5), (3, 4), 1),
         ((2, 3, 4, 5), (4, 5), 2),
     ]

     for hybridize in [False]:
         for config in configs:
             dshape, mshape, start_axis = config
             test_data = np.random.uniform(size=dshape)
             valid_num = 0
             while valid_num == 0:
                 mx_mask = np.random.choice(np.array([False, True], dtype=np.bool), size=mshape)
                 if test_data.size == 0:
                     break
                 valid_num = int(mx_mask.asnumpy().sum())
             np_mask = mx_mask.asnumpy().astype(onp.bool)
             vshape = []
             vshape_broadcast = []
             for i in range(len(dshape)):
                 if i < start_axis:
                     vshape.append(dshape[i])
                     vshape_broadcast.append(dshape[i])
                 elif i == start_axis:
                     vshape.append(valid_num)
                     vshape_broadcast.append(1)
                 elif i >= start_axis + len(mshape):
                     vshape.append(dshape[i])
                     vshape_broadcast.append(dshape[i])
             vshape_broadcast = tuple(vshape_broadcast)
             for val in [42.0, onp.array(42.), onp.array([42.]), onp.random.uniform(size=vshape), onp.random.uniform(size=vshape_broadcast)]:
                 mx_val = val if isinstance(val, float) else np.array(val, dtype=np.float32)
                 test_block = TestBooleanAssignScalar(val, start_axis) if isinstance(val, float) else TestBooleanAssignTensor(start_axis)
                 if hybridize:
                     test_block.hybridize()
                 np_data = test_data.asnumpy()
                 mx_data1 = test_data.copy()
                 mx_data2 = test_data.copy()
                 trailing_axis = len(np_data.shape) - len(np_mask.shape) - start_axis
                 if start_axis == 0:
                     if trailing_axis == 0:
                         np_data[np_mask] = val
                         mx_data1[mx_mask] = mx_val
                     elif trailing_axis == 1:
                         np_data[np_mask, :] = val
                         mx_data1[mx_mask, :] = mx_val
                     elif trailing_axis == 2:
                         np_data[np_mask, :, :] = val
                         mx_data1[mx_mask, :, :] = mx_val
                 elif start_axis == 1:
                     if trailing_axis == 0:
                         np_data[:, np_mask] = val
                         mx_data1[:, mx_mask] = mx_val
                     elif trailing_axis == 1:
                         np_data[:, np_mask, :] = val
                         mx_data1[:, mx_mask, :] = mx_val
                 elif start_axis == 2:
                     if trailing_axis == 0:
                         np_data[:, :, np_mask] = val
                         mx_data1[:, :, mx_mask] = mx_val
                 mx_data1 = test_block(mx_data2, mx_mask) if isinstance(val, float) else test_block(mx_data2, mx_mask, mx_val)
                 assert_almost_equal(mx_data1.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False)
                 assert_almost_equal(mx_data2.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_reshape():
     class TestReshape(HybridBlock):
         def __init__(self, newshape):
             super(TestReshape, self).__init__()
             self._newshape = newshape

         def forward(self, a):
             return np.reshape(a, self._newshape)

     shape_pairs = [((2, 6), (6, 2)), ((2, 6), (3, 4)), ((1, 0), (0,)), ((0, 0), (0,)), ((), (1, 1, 1))]
     for hybridize in [True, False]:
         for shape_pair in shape_pairs:
             shape1, shape2 = shape_pair
             test_reshape = TestReshape(shape2)
             if hybridize:
                 test_reshape.hybridize()
             x = rand_ndarray(shape1).as_np_ndarray()
             x.attach_grad()
             np_out = onp.reshape(x.asnumpy(), shape2)
             with mx.autograd.record():
                 mx_out = test_reshape(x)
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
             mx_out.backward()
             np_backward = onp.ones(shape1)
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5, use_broadcast=False)

             mx_out = np.reshape(x, shape2)
             np_out = onp.reshape(x.asnumpy(), shape2)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('descending', [True, False])
 @pytest.mark.parametrize('shape', [
     (),
     (2, 3),
     (1, 0, 2),
 ])
 @pytest.mark.parametrize('hybrid', [False, True])
 def test_np_argsort(descending, shape, hybrid):
     class TestArgsort(HybridBlock):
         def __init__(self, axis, descending):
             super(TestArgsort, self).__init__()
             self._axis = axis
             self._descending = descending

         def forward(self, x):
             return np.argsort(x, axis=self._axis, descending=self._descending)

     data = np.random.uniform(size=shape)
     np_data = data.asnumpy()
     for axis in [None] + [i for i in range(-len(shape), len(shape))]:
         if descending:
             np_out = onp.argsort(-1 * np_data, axis)
         else:
             np_out = onp.argsort(np_data, axis)

         test_argsort = TestArgsort(axis, descending)

         if hybrid:
             test_argsort.hybridize()
         mx_out = test_argsort(data)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-5, atol=1e-6, use_broadcast=False)

         mx_out = np.argsort(data, axis, descending)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-5, atol=1e-6, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('descending', [True, False])
 @pytest.mark.parametrize('shape', [
     (),
     (1,),
     (5,),
     (4, 3),
     (3, 5),
     (4, 4),
     (4, 5),
     (5, 5),
     (5, 6),
     (6, 6),
     (0, 1),
     (6, 5, 6),
     (2, 3, 3, 4),
     (4, 2, 1, 2),
     (0, 5, 3, 3),
     (5, 0, 3, 3),
     (3, 3, 0, 0),
 ])
 @pytest.mark.parametrize('dtype', [np.int8, np.uint8, np.int32, np.int64, np.float32, np.float64])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_sort(shape, dtype, hybridize, descending):
     class TestSort(HybridBlock):
         def __init__(self, axis, descending):
             super(TestSort, self).__init__()
             self._axis = axis
             self._descending = descending

         def forward(self, x):
             return np.sort(x, self._axis, descending=self._descending)

     a = np.random.uniform(low=0, high=100, size=shape, dtype='float64').astype(dtype)
     axis_list = list(range(len(shape)))
     axis_list.append(None)
     axis_list.append(-1)
     for axis in axis_list:
         test = TestSort(axis, descending)
         if hybridize:
             test.hybridize()
         if axis == -1 and len(shape)==0:
             continue
         ret = test(a)
         if descending:
             expected_ret = -onp.sort(-1 * a.asnumpy(), axis)
         else:
             expected_ret = onp.sort(a.asnumpy(), axis)
         assert_almost_equal(ret.asnumpy(), expected_ret, atol=1e-5, rtol=1e-5, use_broadcast=False)

         # check imperative again
         ret = np.sort(a, axis=axis, descending=descending)
         assert_almost_equal(ret.asnumpy(), expected_ret, atol=1e-5, rtol=1e-5, use_broadcast=False)


 @use_np
 def test_np_squeeze():
     config = [((), None),
               ((), -1),
               ((), 0),
               ((4, 1, 2), None),
               ((1, 1, 1), None),
               ((1, 0, 1, 5), 2),
               ((1, 0, 1, 1), (-1, -4))]

     class TestSqueeze(HybridBlock):
         def __init__(self, axis):
             super(TestSqueeze, self).__init__()
             self._axis = axis

         def forward(self, x):
             return np.squeeze(x, self._axis)

     for shape, axis in config:
         data_np = onp.random.uniform(size=shape)
         data_mx = np.array(data_np, dtype=data_np.dtype)
         ret_np = onp.squeeze(data_np, axis)
         ret_mx = np.squeeze(data_mx, axis)
         assert_almost_equal(ret_mx.asnumpy(), ret_np, rtol=1e-5, atol=1e-6, use_broadcast=False)

         net = TestSqueeze(axis)
         for hybrid in [False, True]:
             if hybrid:
                 net.hybridize()
             data_mx.attach_grad()
             with mx.autograd.record():
                 ret_mx = net(data_mx)
             assert_almost_equal(ret_mx.asnumpy(), ret_np, rtol=1e-5, atol=1e-6, use_broadcast=False)
             ret_mx.backward()
             assert_almost_equal(data_mx.grad.asnumpy(), onp.ones_like(data_np),
                                 rtol=1e-5, atol=1e-6, use_broadcast=False)


 @xfail_when_nonstandard_decimal_separator
 @use_np
 def test_np_tri():
     class TestTri(HybridBlock):
         def __init__(self, N, M=None, k=0, dtype=None):
             super(TestTri, self).__init__()
             self._N = N
             self._M = M
             self._k = k
             self._dtype = dtype

         def forward(self, x):
             return x + np.tri(self._N, self._M, self._k, self._dtype)

     dtypes = ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', None]
     hybrids = [False, True]

     for dtype, hybrid in itertools.product(dtypes, hybrids):
         N = random.randint(2,6)
         M = random.randint(2,6)
         k = random.randint(-M*2, N*2)

         test_tri = TestTri(N, M, k, dtype)
         if hybrid:
             test_tri.hybridize()
         np_out = np.tri(N, M, k, dtype)
         x = np.zeros(shape=(), dtype=dtype)
         mx_out = test_tri(x)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-5, atol=1e-6, use_broadcast=False)

         mx_out = np.tri(N, M, k, dtype)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-5, atol=1e-6, use_broadcast=False)


 @use_np
 def test_np_prod():
     class TestProd(HybridBlock):
         def __init__(self, axis=None, dtype=None, keepdims=False):
             super(TestProd, self).__init__()
             self._axis = axis
             self._dtype = dtype
             self._keepdims = keepdims

         def forward(self, a, *args, **kwargs):
             return np.prod(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims)

     in_data_dim = random.choice([3, 4])
     shape = rand_shape_nd(in_data_dim, dim=3)
     for hybridize in [False, True]:
         for keepdims in [True, False]:
             for axis in ([i for i in range(in_data_dim)] + [(), None]):
                 for itype in ['float32', 'float64']:
                     for dtype in ['float32', 'float64']:
                         # test gluon
                         test_prod = TestProd(axis=axis, dtype=dtype, keepdims=keepdims)
                         if hybridize:
                             test_prod.hybridize()
                         x = np.array(onp.random.uniform(-2.0, 2.0, size=shape), dtype=itype)
                         x.attach_grad()
                         expected_ret = onp.prod(x.asnumpy(), axis=axis, keepdims=keepdims)
                         expected_ret = expected_ret.astype(dtype)
                         with mx.autograd.record():
                             y = test_prod(x)
                         assert y.shape == expected_ret.shape
                         assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5, use_broadcast=False)
                         y.backward()
                         # use keepdims=True so that broadcast divide can be used to calculate
                         # grad of input
                         expected_ret = onp.prod(x.asnumpy(), axis=axis, keepdims=True)
                         assert_almost_equal(x.grad.asnumpy(), expected_ret / x.asnumpy(), rtol=1e-3, atol=1e-3,
                                             use_broadcast=False)

                         # test numeric
                         if itype == 'float32' and dtype == 'float32':
                             x_sym = mx.sym.Variable("x").as_np_ndarray()
                             mx_sym = mx.sym.np.prod(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray()
                             check_numeric_gradient(mx_sym, [x.as_nd_ndarray()],
                                                    numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=onp.float32)

                         # test imperative
                         mx_out = np.prod(x, axis=axis, dtype=dtype, keepdims=keepdims)
                         np_out = onp.prod(x.asnumpy(), axis=axis, keepdims=keepdims).astype(dtype)
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_flatten():
     class TestFlatten(HybridBlock):
         def forward(self, x):
             return x.flatten()

     shapes = [(), (2, 0, 1), (3, 4, 5), 6, (0,), (0, 0, 0)]
     for shape in shapes:
         for hybridize in [True, False]:
             test_flatten = TestFlatten()
             if hybridize:
                 test_flatten.hybridize()
             a_np = onp.random.uniform(size=shape).astype('float32')
             a_mx = np.array(a_np, dtype=a_np.dtype)
             a_mx.attach_grad()
             with mx.autograd.record():
                 ret = test_flatten(a_mx)
             expected_ret = a_np.flatten()
             assert_almost_equal(expected_ret, ret.asnumpy(), rtol=1e-5, atol=1e-6, use_broadcast=False)
             # check gradient
             ret.backward()
             assert_almost_equal(a_mx.grad.asnumpy(), onp.ones_like(a_np), rtol=1e-5, atol=1e-6, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('src_shape,dst_shape', [
     ((), (1, 2, 4, 5)),
     ((1,), (4, 5, 6)),
     ((1, 0), (2, 4, 0)),
     ((1, 1), (2, 4, 0)),
     ((4, 1), (1, 2, 3, 4, 5)),
     ((4, 1), (1, 0, 3, 4, 5))
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_broadcast_to(src_shape, dst_shape, hybridize):
     class TestBroadcastTo(HybridBlock):
         def __init__(self, dst_shape):
             super(TestBroadcastTo, self).__init__()
             self._dst_shape = dst_shape

         def forward(self, x):
             return np.broadcast_to(x, self._dst_shape)

     class TestScalarBroadcastTo(HybridBlock):
         def __init__(self, scalar, dst_shape):
             super(TestScalarBroadcastTo, self).__init__()
             self._scalar = scalar
             self._dst_shape = dst_shape

         def forward(self, x):
             return np.broadcast_to(self._scalar, self._dst_shape)

     test_broadcast_to = TestBroadcastTo(dst_shape)
     if hybridize:
         test_broadcast_to.hybridize()

     a = onp.random.uniform(size=src_shape).astype(np.float32)
     expected_ret = onp.broadcast_to(a, dst_shape)
     a_mx = np.array(a, dtype=a.dtype)
     a_mx.attach_grad()
     with mx.autograd.record():
         ret = test_broadcast_to(a_mx)
     assert_almost_equal(ret.asnumpy(), expected_ret, rtol=1e-5, atol=1e-6, use_broadcast=False)
     ret.backward()
     expected_grad = collapse_sum_like(onp.ones_like(expected_ret), src_shape)
     assert_almost_equal(a_mx.grad.asnumpy(), expected_grad, rtol=1e-5, atol=1e-6, use_broadcast=False)

     # Test scalar case
     scalar = 1.0
     test_scalar_broadcast_to = TestScalarBroadcastTo(scalar, dst_shape)
     expected_ret = onp.broadcast_to(scalar, dst_shape)
     with mx.autograd.record():
         # `np.empty(())` serves as a dummpy input
         ret = test_scalar_broadcast_to(np.empty(()))
     assert_almost_equal(ret.asnumpy(), expected_ret, rtol=1e-5, atol=1e-6, use_broadcast=False)

 @use_np
 @pytest.mark.parametrize('src_shape,npx_dst_shape,np_dst_shape', [
     ((5,), (3, 4, -2), (3, 4, 5)),
     ((5,), (0, -2), (0, 5)),
     ((1, 0), (2, -2, -2), (2, 1, 0)),
     ((3, 4), (1, 2, 3, -2), (1, 2, 3, 4)),
     ((3, 4), (1, 0, -2, 4), (1, 0, 3, 4))
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_broadcast_to_npx(src_shape, npx_dst_shape, np_dst_shape, hybridize):
     class TestBroadcastTo(HybridBlock):
         def __init__(self, dst_shape):
             super(TestBroadcastTo, self).__init__()
             self._dst_shape = dst_shape

         def forward(self, x):
             return np.broadcast_to(x, self._dst_shape)

     class TestScalarBroadcastTo(HybridBlock):
         def __init__(self, scalar, dst_shape):
             super(TestScalarBroadcastTo, self).__init__()
             self._scalar = scalar
             self._dst_shape = dst_shape

         def forward(self, x):
             return np.broadcast_to(self._scalar, self._dst_shape)

     test_broadcast_to = TestBroadcastTo(npx_dst_shape)
     if hybridize:
         test_broadcast_to.hybridize()

     a = onp.random.uniform(size=src_shape).astype(np.float32)
     expected_ret = onp.broadcast_to(a, np_dst_shape)
     a_mx = np.array(a, dtype=a.dtype)
     a_mx.attach_grad()
     with mx.autograd.record():
         ret = test_broadcast_to(a_mx)
     assert_almost_equal(ret.asnumpy(), expected_ret, rtol=1e-5, atol=1e-6, use_broadcast=False)
     ret.backward()
     expected_grad = collapse_sum_like(onp.ones_like(expected_ret), src_shape)
     assert_almost_equal(a_mx.grad.asnumpy(), expected_grad, rtol=1e-5, atol=1e-6, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float16, onp.int32])
 @pytest.mark.parametrize('data_shape,axes_workload', [
     [(), [(), None]],
     [(2,), [(0,), None]],
     [(0, 2), [(0, 1), (1, 0)]],
     [(5, 10), [(0, 1), (1, 0), None]],
     [(8, 2, 3), [(2, 0, 1), (0, 2, 1), (0, 1, 2), (2, 1, 0), (-1, 1, 0), None]],
     [(8, 2, 16), [(0, 2, 1), (2, 0, 1), (0, 1, 2), (2, 1, 0), (-1, -2, -3)]],
     [(8, 3, 4, 8), [(0, 2, 3, 1), (1, 2, 3, 0), (0, 3, 2, 1)]],
     [(8, 3, 2, 3, 8), [(0, 1, 3, 2, 4), (0, 1, 2, 3, 4), (4, 0, 1, 2, 3)]],
     [(3, 4, 3, 4, 3, 2), [(0, 1, 3, 2, 4, 5), (2, 3, 4, 1, 0, 5), None]],
     [(3, 4, 3, 4, 3, 2, 2), [(0, 1, 3, 2, 4, 5, 6),
      (2, 3, 4, 1, 0, 5, 6), None]],
     [(3, 4, 3, 4, 3, 2, 3, 2), [(0, 1, 3, 2, 4, 5, 7, 6),
      (2, 3, 4, 1, 0, 5, 7, 6), None]],
 ])
 @pytest.mark.parametrize('grad_req', ['write', 'add'])
 def test_np_transpose(data_shape, axes_workload, hybridize, dtype, grad_req):
     def np_transpose_grad(out_shape, dtype, axes=None):
         ograd = onp.ones(out_shape, dtype=dtype)
         if axes is None or axes == ():
             return onp.transpose(ograd, axes)
         np_axes = onp.array(list(axes))
         transpose_axes = onp.zeros_like(np_axes)
         transpose_axes[np_axes] = onp.arange(len(np_axes))
         return onp.transpose(ograd, tuple(list(transpose_axes)))

     class TestTranspose(HybridBlock):
         def __init__(self, axes=None):
             super(TestTranspose, self).__init__()
             self.axes = axes

         def forward(self, a):
             return np.transpose(a, self.axes)

     for axes in axes_workload:
         test_trans = TestTranspose(axes)
         if hybridize:
             test_trans.hybridize()
         x = np.random.normal(0, 1, data_shape).astype(dtype)
         x = x.astype(dtype)
         x.attach_grad(grad_req=grad_req)
         if grad_req == 'add':
             x.grad[()] = np.random.normal(0, 1, x.grad.shape).astype(x.grad.dtype)
             x_grad_np = x.grad.asnumpy()
         np_out = onp.transpose(x.asnumpy(), axes)
         with mx.autograd.record():
             mx_out = test_trans(x)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
         mx_out.backward()
         np_backward = np_transpose_grad(np_out.shape, dtype, axes)
         if grad_req == 'add':
             assert_almost_equal(x.grad.asnumpy(), np_backward + x_grad_np,
                                 rtol=1e-3, atol=1e-5, use_broadcast=False)
         else:
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5, use_broadcast=False)

         mx_out = x.transpose(axes)
         np_out = x.asnumpy().transpose(axes)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)

         if isinstance(axes, (list, tuple)):
             mx_out = x.transpose(*axes)
             np_out = x.asnumpy().transpose(*axes)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_transpose_error():
     # Test for error raising
     dat = np.random.normal(0, 1, (3, 4, 5), dtype=np.float32)
     pytest.raises(ValueError, lambda: dat.transpose((0, 0, 1)))
     pytest.raises(MXNetError, lambda: dat.transpose((0, 1, 3)))


 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float16, onp.int32])
 @pytest.mark.parametrize('data_shape,axes_workload', [
     [(), [(), None]],
     [(2,), [(0,), None]],
     [(0, 2), [(0, 1), (1, 0)]],
     [(5, 10), [(0, 1), (1, 0), None]],
     [(8, 2, 3), [(2, 0, 1), (0, 2, 1), (0, 1, 2), (2, 1, 0), (-1, 1, 0), None]],
     [(8, 2, 16), [(0, 2, 1), (2, 0, 1), (0, 1, 2), (2, 1, 0), (-1, -2, -3)]],
     [(8, 3, 4, 8), [(0, 2, 3, 1), (1, 2, 3, 0), (0, 3, 2, 1)]],
     [(8, 3, 2, 3, 8), [(0, 1, 3, 2, 4), (0, 1, 2, 3, 4), (4, 0, 1, 2, 3)]],
     [(3, 4, 3, 4, 3, 2), [(0, 1, 3, 2, 4, 5), (2, 3, 4, 1, 0, 5), None]],
     [(3, 4, 3, 4, 3, 2, 2), [(0, 1, 3, 2, 4, 5, 6),
      (2, 3, 4, 1, 0, 5, 6), None]],
     [(3, 4, 3, 4, 3, 2, 3, 2), [(0, 1, 3, 2, 4, 5, 7, 6),
      (2, 3, 4, 1, 0, 5, 7, 6), None]],
 ])
 @pytest.mark.parametrize('grad_req', ['write', 'add'])
 def test_np_permute_dims(data_shape, axes_workload, hybridize, dtype, grad_req):
     def np_permute_dims_grad(out_shape, dtype, axes=None):
         ograd = onp.ones(out_shape, dtype=dtype)
         if axes is None or axes == ():
             return onp.transpose(ograd, axes)
         np_axes = onp.array(list(axes))
         permute_dims_axes = onp.zeros_like(np_axes)
         permute_dims_axes[np_axes] = onp.arange(len(np_axes))
         return onp.transpose(ograd, tuple(list(permute_dims_axes)))

     class TestPermuteDims(HybridBlock):
         def __init__(self, axes=None):
             super(TestPermuteDims, self).__init__()
             self.axes = axes

         def forward(self, a):
             return np.permute_dims(a, self.axes)

     for axes in axes_workload:
         test_trans = TestPermuteDims(axes)
         if hybridize:
             test_trans.hybridize()
         x = np.random.normal(0, 1, data_shape).astype(dtype)
         x = x.astype(dtype)
         x.attach_grad(grad_req=grad_req)
         if grad_req == 'add':
             x.grad[()] = np.random.normal(0, 1, x.grad.shape).astype(x.grad.dtype)
             x_grad_np = x.grad.asnumpy()
         np_out = onp.transpose(x.asnumpy(), axes)
         with mx.autograd.record():
             mx_out = test_trans(x)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
         mx_out.backward()
         np_backward = np_permute_dims_grad(np_out.shape, dtype, axes)
         if grad_req == 'add':
             assert_almost_equal(x.grad.asnumpy(), np_backward + x_grad_np,
                                 rtol=1e-3, atol=1e-5, use_broadcast=False)
         else:
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_meshgrid():
     nx, ny = (4, 5)
     x = np.array(onp.linspace(0, 1, nx), dtype=np.float32)
     y = np.array(onp.linspace(0, 1, ny), dtype=np.float32)
     z = np.ones(())
     xv, yv, zv = np.meshgrid(x, y, z)
     xv_expected, yv_expected, zv_expected = onp.meshgrid(x.asnumpy(), y.asnumpy(), z.asnumpy())
     assert same(xv.asnumpy(), xv_expected)
     assert same(yv.asnumpy(), yv_expected)
     assert same(zv.asnumpy(), zv_expected)


 @use_np
 @pytest.mark.parametrize('shapes', [
     [(), (2, 1), (1, 3), (4, 1, 1), (5, 4, 2, 3)],
     [(0,), (), (2, 1), (1, 0), (3, 2, 1)]
 ])
 def test_np_broadcast_arrays(shapes):
     arrays_np = [onp.random.randint(low=0, high=1000, size=shape, dtype=onp.int32) for shape in shapes]
     arrays_mx = [np.array(arr, dtype=arr.dtype) for arr in arrays_np]
     expected_rets = onp.broadcast_arrays(*arrays_np)
     rets = np.broadcast_arrays(*arrays_mx)
     for expected_ret, ret in zip(expected_rets, rets):
         assert same(expected_ret, ret.asnumpy())


 @use_np
 def test_np_tile():
     config = [
         ((), ()),
         ((), 0),
         ((), (2, 0)),
         ((), (2, 3)),
         ((4, 2), (2,)),
         ((4, 2), (2, 3)),
         ((4, 2), (2, 1, 4)),
         ((4, 2), (2, 3, 4)),
         ((4, 2), (2, 0)),
         ((4, 2), (2, 0, 3)),
         ((4, 2), (2, 0, 3)),
         ((4, 0), (2, 0, 3)),
     ]

     class TestTile(HybridBlock):
         def __init__(self, reps):
             super(TestTile, self).__init__()
             self._reps = reps

         def forward(self, x):
             return np.tile(x, reps=self._reps)

     for shape, reps in config:
         data_np = onp.random.randint(low=0, high=1000, size=shape)
         data_mx = np.array(data_np, dtype=data_np.dtype)
         ret_np = onp.tile(data_np, reps=reps)
         ret_mx = np.tile(data_mx, reps=reps)
         assert same(ret_mx.asnumpy(), ret_np)

         net = TestTile(reps)
         for hybrid in [False, True]:
             if hybrid:
                 net.hybridize()
             ret_mx = net(data_mx)
             assert same(ret_mx.asnumpy(), ret_np)


 @use_np
 def test_np_tril():
     # numpy tril does not support scalar array (zero-dim)
     config = [
         ((4, 2), 3),
         ((4, 2), 9),
         ((4, 2), 0),
         ((4, 2), -1),
         ((4, 5, 6), 0),
         ((4, 5, 6), 5),
         ((4, 5, 6), 2),
         ((4, 5, 6), -2),
         ((4, 5, 6), -5),
         ((4, 0), 0),
         ((4, 0), 2),
         ((4, 0), 4),
         ((4, 0), -3),
         ((4, 0, 5), 0),
         ((4, 0, 5), 1),
         ((4, 0, 5), 5),
         ((4, 0, 5), -3),
         ((3, ), 0),
         ((3, ), 2),
         ((3, ), 5)
     ]

     class TestTril(HybridBlock):
         def __init__(self, k):
             super(TestTril, self).__init__()
             self._k = k

         def forward(self, x):
             return np.tril(x, k=self._k)

     for prefix in [1, -1]:
         for shape, k in config:
             data_np = onp.random.uniform(size=shape).astype(onp.float32)
             data_mx = np.array(data_np, dtype=data_np.dtype)
             data_mx.attach_grad()
             ret_np = onp.tril(data_np, k*prefix)
             with mx.autograd.record():
                 ret_mx = np.tril(data_mx, k*prefix)
             assert same(ret_mx.asnumpy(), ret_np)
             ret_mx.backward()
             if len(shape) == 2:
                 grad_np = onp.tri(*shape, k=k*prefix)
                 assert same(data_mx.grad.asnumpy(), grad_np)
             if len(shape) == 1:
                 grad_np = onp.tri(*shape, k=k*prefix)
                 grad_np = grad_np.sum(axis=0, keepdims=False)
                 assert same(data_mx.grad.asnumpy(), grad_np)

             net = TestTril(k*prefix)
             for hybrid in [False, True]:
                 if hybrid:
                     net.hybridize()
                 ret_mx = net(data_mx)
                 assert same(ret_mx.asnumpy(), ret_np)


 @use_np
 def test_np_triu():
     # numpy triu does not support scalar array (zero-dim)
     config = [
         ((4, 2), 3),
         ((4, 2), 9),
         ((4, 2), 0),
         ((4, 2), -1),
         ((4, 5, 6), 0),
         ((4, 5, 6), 5),
         ((4, 5, 6), 2),
         ((4, 5, 6), -2),
         ((4, 5, 6), -5),
         ((4, 0), 0),
         ((4, 0), 2),
         ((4, 0), 4),
         ((4, 0), -3),
         ((4, 0, 5), 0),
         ((4, 0, 5), 1),
         ((4, 0, 5), 5),
         ((4, 0, 5), -3),
         ((3, ), 0),
         ((3, ), 2),
         ((3, ), 5)
     ]

     class TestTriu(HybridBlock):
         def __init__(self, k):
             super(TestTriu, self).__init__()
             self._k = k

         def forward(self, x):
             return np.triu(x, k=self._k)

     for prefix in [1, -1]:
         for shape, k in config:
             data_np = onp.random.uniform(size=shape).astype(onp.float32)
             data_mx = np.array(data_np, dtype=data_np.dtype)
             data_mx.attach_grad()
             ret_np = onp.triu(data_np, k*prefix)
             with mx.autograd.record():
                 ret_mx = np.triu(data_mx, k*prefix)
             assert same(ret_mx.asnumpy(), ret_np)
             ret_mx.backward()
             if len(shape) == 2:
                 grad_np = onp.triu(onp.ones_like(data_np), k*prefix)
                 assert same(data_mx.grad.asnumpy(), grad_np)
             if len(shape) == 1:
                 grad_np = onp.triu(onp.ones(shape), k*prefix)
                 grad_np = grad_np.sum(axis=0, keepdims=False)
                 assert same(data_mx.grad.asnumpy(), grad_np)

             net = TestTriu(k*prefix)
             for hybrid in [False, True]:
                 if hybrid:
                     net.hybridize()
                 ret_mx = net(data_mx)
                 assert same(ret_mx.asnumpy(), ret_np)


 @use_np
 def test_np_unary_funcs():
     def check_unary_func(func, ref_grad, shape, low, high):
         class TestUnary(HybridBlock):
             def __init__(self, func):
                 super(TestUnary, self).__init__()
                 self._func = func

             def forward(self, a, *args, **kwargs):
                 return getattr(np, self._func)(a)

         np_func = getattr(onp, func)
         np_test_data = onp.random.uniform(low, high, shape).astype(onp.float32)
         mx_test_data = mx.numpy.array(np_test_data)
         for hybridize in [True, False]:
             mx_func = TestUnary(func)
             if hybridize:
                 mx_func.hybridize()
             if ref_grad:
                 mx_test_data.attach_grad()
             np_out = np_func(np_test_data)
             with mx.autograd.record():
                 y = mx_func(mx_test_data)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             if np_out.dtype == np.bool_:
                 assert y.dtype == np.bool_

             if ref_grad:
                 y.backward()
                 assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True)

         np_out = getattr(onp, func)(np_test_data)
         mx_out = getattr(mx.np, func)(mx_test_data)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  subok=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  dtype=onp.int8)
         assertRaises(TypeError, getattr(np, func), mx_test_data,  dtype="abcdefg")
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  casting='safe')
         assertRaises(TypeError, getattr(np, func), mx_test_data,  casting='mxnet')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='C')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='mxnet')

     funcs = {
         'absolute' : (lambda x: -1. * (x < 0) + (x > 0), -1.0, 1.0),
         'logical_not' : (None, -1.0, 1.0),
         'negative' : (lambda x: -1. * onp.ones(x.shape), -1.0, 1.0),
         'positive' : (lambda x: onp.ones(x.shape), -1.0, 1.0),
         'reciprocal' : (lambda x: -1. / (x ** 2), 0.01, 1.0),
         'sign' : (None, -1.0, 1.0),
         'square' : (lambda x: 2.0 * x, -1.0, 1.0),
     }
     if has_tvm_ops():
         funcs['rad2deg'] = (lambda x: 180. / onp.pi * onp.ones(x.shape), -1.0, 1.0)
         funcs['deg2rad'] = (lambda x: onp.pi / 180. * onp.ones(x.shape), -1.0, 1.0)
     ndim = random.choice([2, 3, 4])
     for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]:
         for func, func_data in funcs.items():
             ref_grad, low, high = func_data
             check_unary_func(func, ref_grad, shape, low, high)


 @use_np
 def test_negation():
     class TestNegation(HybridBlock):
         def forward(self, a):
             return -a
     mx_func = TestNegation()
     for dtype in [onp.int8, onp.int32, onp.float16, onp.float32, onp.float64]:
         np_test_data = onp.random.uniform(-1, 1, (5, 5)).astype(dtype)
         for hybridize in [True, False]:
             mx_test_data = mx.numpy.array(np_test_data, dtype=dtype)
             if hybridize:
                 mx_func.hybridize()
             y = mx_func(mx_test_data)
             assert y.shape == (5, 5)
             assert y.dtype == dtype
             assert_almost_equal(y.asnumpy(), -np_test_data)


 @use_np
 @retry(3)
 @pytest.mark.parametrize('func,ref_grad,low,high', [
     ('cbrt', lambda x: 1. / (3. * onp.cbrt(x) ** 2), -1.0, 1.0),
     ('ceil', None, -10.0, 10.0),
     ('exp', lambda x: onp.exp(x), -1.0, 1.0),
     ('expm1', lambda x: onp.exp(x), -1.0, 1.0),
     ('fix', None, -10.0, 10.0),
     ('floor', None, -10.0, 10.0),
     ('log', lambda x: 1.0 / x, 0.1, 5.0),
     ('log10', lambda x: 1.0 / (x * onp.log(10)), 0.1, 10.0),
     ('log1p', lambda x: 1.0 / (1.0 + x), -0.9, 5.0),
     ('log2', lambda x: 1.0 / (x * onp.log(2)), 0.1, 2.0),
     ('rint', None, -5.0, 5.0),
     ('sqrt', lambda x: 0.5 / onp.sqrt(x), 0.001, 10.0),
     ('trunc', None, -5.0, 5.0),
     ('sin', lambda x: onp.cos(x), -1.0, 1.0),
     ('cos', lambda x: -onp.sin(x), -1.0, 1.0),
     ('tan', lambda x: onp.tan(x) ** 2 + 1.0, -1.0, 1.0),
     ('arcsin', lambda x: 1. / (1. - x ** 2) ** (1. / 2.), -1.0, 1.0),
     ('arccos', lambda x: -1. / (1. - x ** 2.) ** (1. / 2.), -1.0, 1.0),
     ('arctan', lambda x: 1. / (x ** 2. + 1.), -1.0, 1.0),
     ('degrees', lambda x: 180. / onp.pi * onp.ones(x.shape), -1.0, 1.0),
     ('radians', lambda x: onp.pi / 180. * onp.ones(x.shape), -1.0, 1.0),
     ('sinh', lambda x: onp.cosh(x), -1.0, 1.0),
     ('cosh', lambda x: onp.sinh(x), -1.0, 1.0),
     ('tanh', lambda x: 1. - onp.tanh(x) ** 2, -1.0, 1.0),
     ('arcsinh', lambda x: 1./(x**2 + 1.)**(1./2.), -1.0, 1.0),
     ('arccosh', lambda x: 1./(x**2 - 1.)**(1./2.), 2.0, 5.0),
     ('arctanh', lambda x: -1./(x**2 - 1.), -0.99, 0.99)
 ])
 @pytest.mark.parametrize('ndim', [2, 3, 4])
 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64', 'int8', 'uint8', 'int32', 'int64', 'bool'])
 def test_np_mixedType_unary_funcs(func, ref_grad, low, high, ndim, dtype):
     class TestMixedUnary(HybridBlock):
         def __init__(self, func):
             super(TestMixedUnary, self).__init__()
             self._func = func

         def forward(self, a, *args, **kwargs):
             return getattr(np, self._func)(a)

     import math

     shapes = [i for i in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]];
     for shape in shapes:
         print(func, dtype, shape)
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5
         # get rid of warning: divide by zero
         if((func=='log' or func=='log10' or func=='log2') and
             (dtype=='int8' or dtype=='uint8' or dtype=='int32' or
             dtype=='int64')):
             low = 1
         if (func=='arctanh' and dtype=='bool'):
             continue
         np_func = getattr(onp, func)
         mx_func = TestMixedUnary(func)
         np_test_data = onp.random.uniform(low, high, shape).astype(dtype)
         mx_test_data = np.array(np_test_data)
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             if ref_grad:
                 mx_test_data.attach_grad()
             np_out = np_func(np_test_data)
             with mx.autograd.record():
                 y = mx_func(mx_test_data)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             if np_out.dtype == np.bool_:
                 assert y.dtype == np.bool_

             if ref_grad and (dtype == 'float16' or dtype == 'float32' or dtype == 'float64'):
                 y.backward()
                 assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True)

         np_out = getattr(onp, func)(np_test_data)
         mx_out = getattr(mx.np, func)(mx_test_data)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, subok=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, dtype=onp.int8)
         assertRaises(TypeError, getattr(np, func), mx_test_data, dtype="abcdefg")
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, casting='safe')
         assertRaises(TypeError, getattr(np, func), mx_test_data, casting='mxnet')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='C')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='mxnet')


 @use_np
 @pytest.mark.parametrize('ndim', [2, 3, 4])
 @pytest.mark.parametrize('func,low,high', [
     ('bitwise_not', -5, 5),
     ('invert', -5, 5),
 ])
 def test_np_bitwise_not(func, low, high, ndim):
     def check_unary_func(func, shape, low, high):
         class TestUnary(HybridBlock):
             def __init__(self, func):
                 super(TestUnary, self).__init__()
                 self._func = func

             def forward(self, a, *args, **kwargs):
                 return getattr(np, self._func)(a)

         np_func = getattr(onp, func)
         mx_func = TestUnary(func)
         np_test_data = onp.random.uniform(low, high, shape).astype(onp.int32)
         mx_test_data = mx.numpy.array(np_test_data).astype(onp.int32)
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             np_out = np_func(np_test_data)
             with mx.autograd.record():
                 y = mx_func(mx_test_data)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             if np_out.dtype == np.bool_:
                 assert y.dtype == np.bool_

         np_out = getattr(onp, func)(np_test_data)
         mx_out = getattr(mx.np, func)(mx_test_data)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

         assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  subok=False)
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  dtype=onp.int8)
         assertRaises(TypeError, getattr(np, func), mx_test_data,  dtype="abcdefg")
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  casting='safe')
         assertRaises(TypeError, getattr(np, func), mx_test_data,  casting='mxnet')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='C')
         assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='mxnet')

     shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)])
     for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]:
         check_unary_func(func, shape, low, high)


 @use_np
 @pytest.mark.parametrize('ndim', [2, 3, 4])
 @pytest.mark.parametrize('func,low,high', [
     ('left_shift', -5, 5),
     ('right_shift', -5, 5),
 ])
 def test_np_bitwise_shift(func, low, high, ndim):
     def check_unary_func(func, shape, low, high):
         class TestUnary(HybridBlock):
             def __init__(self, func):
                 super(TestUnary, self).__init__()
                 self._func = func

             def forward(self, a, b, *args, **kwargs):
                 return getattr(np, self._func)(a, b)

         np_func = getattr(onp, func)
         mx_func = TestUnary("bitwise_" + func)
         np_test_data1 = onp.random.randint(low, high, shape).astype(onp.int64)
         np_test_data2 = onp.random.randint(low + 5, high + 5, shape).astype(onp.int64)
         mx_test_data1 = mx.numpy.array(np_test_data1).astype(onp.int64)
         mx_test_data2 = mx.numpy.array(np_test_data2).astype(onp.int64)
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             np_out = np_func(np_test_data1, np_test_data2)
             with mx.autograd.record():
                 y = mx_func(mx_test_data1, mx_test_data2)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             if np_out.dtype == np.bool_:
                 assert y.dtype == np.bool_

         np_out = getattr(onp, func)(np_test_data1, np_test_data2)
         mx_out = getattr(mx.np, "bitwise_" + func)(mx_test_data1, mx_test_data2)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, where=False)
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, subok=False)
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, dtype=onp.int8)
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, dtype="abcdefg")
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, casting='safe')
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, casting='mxnet')
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, order='C')
         assertRaises(TypeError, getattr(np, "bitwise_" + func), mx_test_data1, mx_test_data2, order='mxnet')

     shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)])
     for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]:
         check_unary_func(func, shape, low, high)


 @use_np
 def test_np_binary_funcs():
     def check_binary_func(func, lshape, rshape, low, high, lgrads, rgrads=None, alltypes=None):
         class TestBinary(HybridBlock):
             def __init__(self, func):
                 super(TestBinary, self).__init__()
                 self._func = func

             def forward(self, a, b, *args, **kwargs):
                 return getattr(np, self._func)(a, b)

         np_func = getattr(onp, func)
         mx_func = TestBinary(func)
         alltypes = alltypes if alltypes else [[onp.float16, onp.float32, onp.float64]]
         for dtypes, lgrad, rgrad in zip(alltypes, lgrads, rgrads if rgrads else lgrads):
             for dtype in dtypes:
                 ldtype = rdtype = dtype
                 if isinstance(dtype, tuple):
                     assert len(dtype) == 2
                     ldtype, rdtype = dtype
                 npldtype = ldtype if dtype != onp.float16 else onp.float32
                 nprdtype = rdtype if dtype != onp.float16 else onp.float32
                 np_test_x1 = onp.random.uniform(low, high, lshape).astype(ldtype).astype(npldtype)
                 np_test_x2 = onp.random.uniform(low, high, rshape).astype(rdtype).astype(nprdtype)
                 mx_test_x1 = mx.numpy.array(np_test_x1, dtype=ldtype)
                 mx_test_x2 = mx.numpy.array(np_test_x2, dtype=rdtype)
                 for hybridize in [True, False]:
                     if hybridize:
                         mx_func.hybridize()
                     if lgrad:
                         mx_test_x1.attach_grad()
                         mx_test_x2.attach_grad()
                     np_out = np_func(np_test_x1, np_test_x2)
                     with mx.autograd.record():
                         y = mx_func(mx_test_x1, mx_test_x2)
                     assert y.shape == np_out.shape
                     assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=1e-3, atol=1e-5,
                                         use_broadcast=False, equal_nan=True)

                     if lgrad:
                         y.backward()
                         assert_almost_equal(mx_test_x1.grad.asnumpy(),
                                             collapse_sum_like(lgrad(y.asnumpy(), np_test_x1, np_test_x2), mx_test_x1.shape),
                                             rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                         if rgrads is None:
                             assert_almost_equal(mx_test_x2.grad.asnumpy(),
                                                 collapse_sum_like(rgrad(y.asnumpy(), np_test_x2, np_test_x1), mx_test_x2.shape),
                                                 rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                         else:
                             assert_almost_equal(mx_test_x2.grad.asnumpy(),
                                                 collapse_sum_like(rgrad(y.asnumpy(), np_test_x1, np_test_x2), mx_test_x2.shape),
                                                 rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)

                 np_out = getattr(onp, func)(np_test_x1, np_test_x2)
                 mx_out = getattr(mx.np, func)(mx_test_x1, mx_test_x2)
                 assert mx_out.shape == np_out.shape
                 assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=1e-3, atol=1e-5,
                                     use_broadcast=False, equal_nan=True)

                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2, where=False)
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2,  subok=False)
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2,  dtype=onp.int8)
                 assertRaises(TypeError, getattr(np, func), mx_test_x1, mx_test_x2,  dtype="abcdefg")
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2,  casting='safe')
                 assertRaises(TypeError, getattr(np, func), mx_test_x1, mx_test_x2,  casting='mxnet')
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2,  order='C')
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_x1, mx_test_x2,  order='mxnet')

     funcs = {
         'add': (-1.0, 1.0, [lambda y, x1, x2: onp.ones(y.shape)], None),
         'subtract':
         (-1.0, 1.0, [lambda y, x1, x2: onp.ones(y.shape)],
                     [lambda y, x1, x2: -onp.ones(y.shape)]),
         'multiply': (-1.0, 1.0, [lambda y, x1, x2: onp.broadcast_to(x2, y.shape)],
                                 [lambda y, x1, x2: onp.broadcast_to(x1, y.shape)]),
         'divide': (0.1, 1.0, [lambda y, x1, x2: onp.ones(y.shape) / x2],
                    [lambda y, x1, x2: -x1 / (x2 * x2)]),
         'floor_divide': (0.1, 1.0, [lambda y, x1, x2: onp.zeros(y.shape)],
                  [lambda y, x1, x2: onp.zeros(y.shape)]),
         'mod': (1.0, 10.0,
                 [lambda y, x1, x2: onp.ones(y.shape),
                  lambda y, x1, x2: onp.zeros(y.shape)],
                 [lambda y, x1, x2: -onp.floor(x1 / x2),
                  lambda y, x1, x2: onp.zeros(y.shape)],
                 [[onp.float16, onp.float32, onp.float64], [onp.int32]]),
         'fmod': (1.0, 10.0,
                 [lambda y, x1, x2: onp.ones(y.shape),
                  lambda y, x1, x2: onp.zeros(y.shape)],
                 [lambda y, x1, x2: -onp.floor(x1 / x2),
                  lambda y, x1, x2: onp.zeros(y.shape)],
                 [[onp.float16, onp.float32, onp.float64], [onp.int32]]),
         'remainder': (1.0, 10.0,
                       [lambda y, x1, x2: onp.ones(y.shape),
                        lambda y, x1, x2: onp.zeros(y.shape)],
                       [lambda y, x1, x2: -onp.floor(x1 / x2),
                        lambda y, x1, x2: onp.zeros(y.shape)],
                       [[onp.float16, onp.float32, onp.float64], [onp.int32]]),
         'power': (1.0, 3.0, [lambda y, x1, x2: onp.power(x1, x2 - 1.0) * x2],
                              [lambda y, x1, x2: onp.power(x1, x2) * onp.log(x1)]),
         'gcd': (-100, 100, [None], None, [[onp.int32]]),
         'lcm': (-100, 100, [None], None, [[onp.int32]]),
         'bitwise_and': (-100, 100, [None], None, [[onp.int32]]),
         'bitwise_xor': (-100, 100, [None], None, [[onp.int32]]),
         'bitwise_or': (-100, 100, [None], None, [[onp.int32]]),
         'maximum': (-10, 10, [lambda y, x1, x2: onp.ones(y.shape) * (x1 >= x2)],
                              [lambda y, x1, x2: onp.ones(y.shape) * (x1 < x2)],
                              [[onp.int32, onp.float16, onp.float32, onp.float64]]),
         'fmax': (-1, 1, [lambda y, x1, x2: onp.ones(y.shape) * (x1 >= x2)],
                         [lambda y, x1, x2: onp.ones(y.shape) * (x1 < x2)]),
         'minimum': (-10, 10, [lambda y, x1, x2: onp.ones(y.shape) * (x1 <= x2)],
                              [lambda y, x1, x2: onp.ones(y.shape) * (x1 > x2)],
                              [[onp.int32, onp.float16, onp.float32, onp.float64]]),
         'fmin': (-1, 1, [lambda y, x1, x2: onp.ones(y.shape) * (x1 <= x2)],
                         [lambda y, x1, x2: onp.ones(y.shape) * (x1 > x2)]),
         'copysign': (-1, 1,
                      [lambda y, x1, x2: onp.ones(y.shape) * (((x1 * x2) >= 0).astype(onp.float32) - ((x1 * x2) < 0).astype(onp.float32))],
                      [lambda y, x1, x2: onp.zeros(y.shape)]),
         'arctan2': (-1, 1, [lambda y, x1, x2: x2 / (onp.square(x1) + onp.square(x2))],
                            [lambda y, x1, x2: -x1 / (onp.square(x1) + onp.square(x2))]),
         'hypot': (-1, 1, [lambda y, x1, x2: x1 / y],
                          [lambda y, x1, x2: x2 / y]),
         'ldexp': (-3, 3, [None], None, [[onp.int32]]),
         'logaddexp': (-10, 10, [lambda y, x1, x2: onp.exp(x1) / (onp.exp(x1) + onp.exp(x2))],
                                [lambda y, x1, x2: onp.exp(x2) / (onp.exp(x1) + onp.exp(x2))])
     }
     if is_op_runnable():
         funcs['logical_and'] = (-100, 100, [None], None, [[onp.float32, onp.float64]])
         funcs['logical_or'] = (-100, 100, [None], None, [[onp.float32, onp.float64]])
         funcs['logical_xor'] = (-100, 100, [None], None, [[onp.float32, onp.float64]])
     shape_pairs = [((3, 2), (3, 2)),
                    ((3, 2), (3, 1)),
                    ((3, 1), (3, 0)),
                    ((0, 2), (1, 2)),
                    ((2, 3, 4), (3, 1)),
                    ((2, 3), ()),
                    ((), (2, 3))]
     for lshape, rshape in shape_pairs:
         for func, func_data in funcs.items():
             dtypes = None
             assert (len(func_data) == 4 or len(func_data) == 5)
             if len(func_data) is 4:
                 low, high, lgrads, rgrads = func_data
             else:
                 low, high, lgrads, rgrads, dtypes = func_data
             check_binary_func(func, lshape, rshape, low, high, lgrads, rgrads, dtypes)


 @use_np
 def test_np_mixed_precision_binary_funcs():
     itypes = [np.bool, np.int8, np.int32, np.int64]
     ftypes = [np.float16, np.float32, np.float64]
     def check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, ltype, rtype):
         class TestMixedBinary(HybridBlock):
             def __init__(self, func):
                 super(TestMixedBinary, self).__init__()
                 self._func = func

             def forward(self, a, b, *args, **kwargs):
                 return getattr(np, self._func)(a, b)

         if (func in ['multiply', 'mod', 'equal', 'not_equal', 'greater',
                     'greater_equal', 'less', 'less_equal']) and \
             (lshape == () or rshape == ()) :
         # the behaviors of infer type in dealing with the input shape of '()' are different between np and onp
         # for example,
         # mx_test_x1 = np.random.uniform(-2, 2, (2,3)).astype(np.float32)
         # mx_test_x2 = np.random.uniform(-2, 2, ()).astype(np.float16)
         # np_out = onp.mod(mx_test_x1.asnumpy(), mx_test_x2.asnumpy()) # float16
         # mx_out = np.mod(mx_test_x1, mx_test_x2) # float32

         # logcial ops: when two numbers are only different in precision, NumPy also has a weird behavior
         # for example,
         # a = np.array([[1.441]], dtype = np.float16)
         # b = np.array(1.4413278, dtype = np.float32)
         # c = np.array([1.4413278], dtype = np.float32)
         # np.greater(a,b), np.greater(a,c) # True True
         # onp.greater(a.asnumpy(),b.asnumpy()), onp.greater(a.asnumpy(),c.asnumpy()) # False True

         # thus, skip the tests
             return

         np_func = getattr(onp, func)
         mx_func = TestMixedBinary(func)
         np_test_x1 = onp.random.uniform(low, high, lshape).astype(ltype)
         np_test_x2 = onp.random.uniform(low, high, rshape).astype(rtype)
         mx_test_x1 = mx.numpy.array(np_test_x1, dtype=ltype)
         mx_test_x2 = mx.numpy.array(np_test_x2, dtype=rtype)
         rtol = 1e-2 if ltype is np.float16 or rtype is np.float16 else 1e-3
         atol = 1e-3 if ltype is np.float16 or rtype is np.float16 else 1e-5
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             if lgrad:
                 mx_test_x1.attach_grad()
                 mx_test_x2.attach_grad()
             np_out = np_func(np_test_x1, np_test_x2)
             with mx.autograd.record():
                 y = mx_func(mx_test_x1, mx_test_x2)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=rtol, atol=atol,
                                 use_broadcast=False, equal_nan=True)

             if lgrad:
                 if (ltype in itypes) and (rtype in itypes):
                     continue
                 y.backward()
                 if ltype not in itypes:
                     assert_almost_equal(mx_test_x1.grad.asnumpy(),
                                         collapse_sum_like(lgrad(y.asnumpy(), np_test_x1, np_test_x2), mx_test_x1.shape),
                                         rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                 if rtype not in itypes:
                     if rgrad is None:
                         assert_almost_equal(mx_test_x2.grad.asnumpy(),
                                             collapse_sum_like(rgrad(y.asnumpy(), np_test_x2, np_test_x1), mx_test_x2.shape),
                                             rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                     else:
                         assert_almost_equal(mx_test_x2.grad.asnumpy(),
                                             collapse_sum_like(rgrad(y.asnumpy(), np_test_x1, np_test_x2), mx_test_x2.shape),
                                             rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)


         np_out = getattr(onp, func)(np_test_x1, np_test_x2)
         mx_out = getattr(mx.np, func)(mx_test_x1, mx_test_x2)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=rtol, atol=atol,
                             use_broadcast=False, equal_nan=True)

     funcs = {
         'add': (-1.0, 1.0, lambda y, x1, x2: onp.ones(y.shape),
                            lambda y, x1, x2: onp.ones(y.shape)),
         'subtract': (-1.0, 1.0, lambda y, x1, x2: onp.ones(y.shape),
                                 lambda y, x1, x2: onp.ones(y.shape) * -1),
         'multiply': (-1.0, 1.0, lambda y, x1, x2: onp.broadcast_to(x2, y.shape),
                                 lambda y, x1, x2: onp.broadcast_to(x1, y.shape)),
         'mod': (1.0, 5.0, None, None),
         'power': (1.0, 3.0, lambda y, x1, x2: onp.power(x1, x2 - 1.0) * x2,
                             lambda y, x1, x2: onp.power(x1, x2) * onp.log(x1)),
         'equal': (0.0, 2.0, None, None),
         'not_equal': (0.0, 2.0, None, None),
         'greater': (0.0, 2.0, None, None),
         'less': (0.0, 2.0, None, None),
         'greater_equal': (0.0, 2.0, None, None),
         'less_equal': (0.0, 2.0, None, None),
         'logical_and': (0.0, 2.0, None, None),
         'logical_or': (0.0, 2.0, None, None),
         'logical_xor': (0.0, 2.0, None, None),
     }

     shape_pairs = [((3, 2), (3, 2)),
                    ((3, 2), (3, 1)),
                    ((3, 0), (3, 0)),
                    ((3, 1), (3, 0)),
                    ((0, 2), (1, 2)),
                    ((2, 3, 4), (3, 1)),
                    ((2, 3), ()),
                    ((), (2, 3))]

     itypes = [np.bool, np.int8, np.int32, np.int64]
     ftypes = [np.float16, np.float32, np.float64]
     for func, func_data in funcs.items():
         low, high, lgrad, rgrad = func_data
         for lshape, rshape in shape_pairs:
             for type1, type2 in itertools.product(itypes, ftypes):
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type2, type1)

             for type1, type2 in itertools.product(ftypes, ftypes):
                 if type1 == type2:
                     continue
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)

             if func == 'subtract' or func == 'mod':
                 continue
             for type1, type2 in itertools.product(itypes, itypes):
                 if type1 == type2:
                     continue
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)

 @use_np
 def test_np_mixed_mxnp_op_funcs():
     # generate onp & mx_np in same type
     _np = onp.array([1,2,3,4,5]).astype("int64")
     mx_np = mx.np.array([1,2,3,4,5]).astype("int64")
     # inplace onp mx_np
     _np += mx_np
     assert isinstance(_np, onp.ndarray)
     _np -= mx_np
     assert isinstance(_np, onp.ndarray)
     _np *= mx_np
     assert isinstance(_np, onp.ndarray)
     # inplace mx_np onp
     mx_np ^= _np
     assert isinstance(mx_np, mx.np.ndarray)
     mx_np |= _np
     assert isinstance(mx_np, mx.np.ndarray)
     mx_np &= _np
     assert isinstance(mx_np, mx.np.ndarray)
     # mxnp onp
     out = mx_np << _np
     assert isinstance(out, mx.np.ndarray)
     out = mx_np >> _np
     assert isinstance(out, mx.np.ndarray)
     out = mx_np != _np
     assert isinstance(out, mx.np.ndarray)
     # onp mxnp
     out = _np == mx_np
     assert isinstance(out, mx.np.ndarray)
     out = _np >= mx_np
     assert isinstance(out, mx.np.ndarray)
     out = _np < mx_np
     assert isinstance(out, mx.np.ndarray)
     _np = onp.array([1,2,3,4,5]).astype("float32")
     mx_np = mx.np.array([1,2,3,4,5]).astype("float32")
     out = _np @ mx_np
     assert isinstance(out, mx.np.ndarray)
     out = _np / mx_np
     assert isinstance(out, mx.np.ndarray)

 @use_np
 def test_np_binary_scalar_funcs():
     itypes = [np.int8, np.int32, np.int64]
     def check_binary_scalar_func(func, low, high, lshape, lgrad, ltype, scalar_is_int, hybridize):
         class TestBinaryScalar(HybridBlock):
             def __init__(self, func, scalar):
                 super(TestBinaryScalar, self).__init__()
                 self._func = func
                 self._scalar = scalar

             def forward(self, a, *args, **kwargs):
                 return getattr(np, self._func)(a, self._scalar)

         np_test_x1 = onp.random.uniform(low, high, lshape).astype(ltype)
         np_test_x2 = int(onp.random.uniform(low, high)) if scalar_is_int else onp.random.uniform(low, high)
         mx_test_x1 = np.array(np_test_x1, dtype=ltype)
         mx_test_x2 = np_test_x2
         np_func = getattr(onp, func)
         mx_func = TestBinaryScalar(func, mx_test_x2)
         if hybridize:
             mx_func.hybridize()
         rtol = 1e-2 if ltype is np.float16 else 1e-3
         atol = 1e-3 if ltype is np.float16 else 1e-5
         if ltype not in itypes:
             if lgrad:
                 mx_test_x1.attach_grad()
             np_out = np_func(np_test_x1, np_test_x2)
             with mx.autograd.record():
                 y = mx_func(mx_test_x1)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=rtol, atol=atol)
             if lgrad:
                 y.backward()
                 assert_almost_equal(mx_test_x1.grad.asnumpy(),
                                     collapse_sum_like(lgrad(y.asnumpy(), np_test_x1, np_test_x2), mx_test_x1.shape),
                                     rtol=rtol, atol=atol, equal_nan=True, use_broadcast=False)

         # Test imperative
         np_out = getattr(onp, func)(np_test_x1, np_test_x2)
         mx_out = getattr(mx.np, func)(mx_test_x1, mx_test_x2)
         assert mx_out.shape == np_out.shape
         assert mx_out.asnumpy().dtype == np_out.dtype
         assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=rtol, atol=atol)

     funcs = {
         'add': (-1.0, 1.0, None),
         'subtract': (-1.0, 1.0, None),
         'multiply': (-1.0, 1.0, lambda y, x1, x2: onp.broadcast_to(x2, y.shape)),
         'power': (1.0, 5.0, lambda y, x1, x2: onp.power(x1, x2 - 1.0) * x2),
     }

     shapes = [(3, 2), (3, 0), (3, 1), (0, 2), (2, 3, 4)]
     ltypes = [np.int32, np.int64, np.float16, np.float32, np.float64]
     flags = [True, False]
     for func, func_data in funcs.items():
         low, high, lgrad = func_data
         for shape, ltype, is_int, hybridize in itertools.product(shapes, ltypes, flags, flags):
                 check_binary_scalar_func(func, low, high, shape, lgrad, ltype, is_int, hybridize)


 @use_np
 def test_np_boolean_binary_funcs():
     def check_boolean_binary_func(func, mx_x1, mx_x2):
         class TestBooleanBinary(HybridBlock):
             def __init__(self, func):
                 super(TestBooleanBinary, self).__init__()
                 self._func = func

             def forward(self, a, b, *args, **kwargs):
                 return getattr(np, self._func)(a, b)

         np_x1 = mx_x1.asnumpy()
         np_x2 = mx_x2.asnumpy()
         np_func = getattr(onp, func)
         mx_func = TestBooleanBinary(func)
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             np_out = np_func(np_x1, np_x2)
             with mx.autograd.record():
                 y = mx_func(mx_x1, mx_x2)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=1e-3, atol=1e-20,
                                 use_broadcast=False, equal_nan=True)

         np_out = getattr(onp, func)(np_x1, np_x2)
         mx_out = getattr(mx.np, func)(mx_x1, mx_x2)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=1e-3, atol=1e-20,
                             use_broadcast=False, equal_nan=True)


     funcs = [
         'add',
         'multiply',
         'true_divide',
     ]

     shape_pairs = [((3, 2), (3, 2)),
                    ((3, 2), (3, 1)),
                    ((3, 1), (3, 0)),
                    ((0, 2), (1, 2)),
                    ((2, 3, 4), (3, 1)),
                    ((2, 3), ()),
                    ((), (2, 3))]

     for lshape, rshape in shape_pairs:
         for func in funcs:
             x1 = np.array(onp.random.uniform(size=lshape) > 0.5)
             x2 = np.array(onp.random.uniform(size=rshape) > 0.5)
             check_boolean_binary_func(func, x1, x2)


 @use_np
 def test_npx_relu():
     def np_relu(x):
         return onp.maximum(x, 0.0)
     def np_relu_grad(x):
         return 1.0 * (x > 0.0)

     class TestReLU(HybridBlock):
         def __init__(self):
             super(TestReLU, self).__init__()

         def forward(self, a):
             return npx.relu(a)

     shapes = [(), (2, 3, 4), (2, 0, 3), (1, 0, 0)]
     for hybridize in [True, False]:
         for shape in shapes:
             test_relu = TestReLU()
             if hybridize:
                 test_relu.hybridize()
             x = rand_ndarray(shape).as_np_ndarray()
             x.attach_grad()
             np_out = np_relu(x.asnumpy())
             with mx.autograd.record():
                 mx_out = test_relu(x)
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             mx_out.backward()
             np_backward = np_relu_grad(x.asnumpy())
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

             mx_out = npx.relu(x)
             np_out = np_relu(x.asnumpy())
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_npx_activation_log_sigmoid():
     def np_log_sigmoid(x):
         return onp.log(onp.divide(1.0, (1.0 + onp.exp(-x))))
     def np_log_sigmoid_grad(x):
         return onp.divide(1.0, onp.add(1.0, onp.exp(x)))

     class TestLogSigmoid(HybridBlock):
         def __init__(self):
             super(TestLogSigmoid, self).__init__()

         def forward(self, a):
             return npx.activation(a, act_type='log_sigmoid')

     shapes = [(), (2, 3, 4)]
     for hybridize in [True, False]:
         for shape in shapes:
             test_log_sigmoid = TestLogSigmoid()
             if hybridize:
                 test_log_sigmoid.hybridize()
             x = rand_ndarray(shape).as_np_ndarray()
             x.attach_grad()
             np_out = np_log_sigmoid(x.asnumpy())
             with mx.autograd.record():
                 mx_out = test_log_sigmoid(x)
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             mx_out.backward()
             np_backward = np_log_sigmoid_grad(x.asnumpy())
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

             mx_out = npx.activation(x, act_type='log_sigmoid')
             np_out = np_log_sigmoid(x.asnumpy())
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_npx_activation_mish():
     def np_mish(a):
         return a * onp.tanh(onp.log1p(onp.exp(a)))
     def np_mish_grad(a):
         softrelu = onp.log1p(onp.exp(a))
         tanh = onp.tanh(softrelu)
         sigmoid = onp.divide(1.0, (1.0 + onp.exp(-a)))
         return tanh + a * sigmoid * (1.0 - tanh * tanh)

     class TestMish(HybridBlock):
         def __init__(self):
             super(TestMish, self).__init__()

         def forward(self, a):
             return npx.activation(a, act_type='mish')

     shapes = [(), (2, 3, 4)]
     for hybridize in [True, False]:
         for shape in shapes:
             test_mish = TestMish()
             if hybridize:
                 test_mish.hybridize()
             x = rand_ndarray(shape).as_np_ndarray()
             x.attach_grad()
             np_out = np_mish(x.asnumpy())
             with mx.autograd.record():
                 mx_out = test_mish(x)
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             mx_out.backward()
             np_backward = np_mish_grad(x.asnumpy())
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

             mx_out = npx.activation(x, act_type='mish')
             np_out = np_mish(x.asnumpy())
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_npx_sigmoid():
     def np_sigmoid(x):
         return onp.divide(1.0, (1.0 + onp.exp(-x)))
     def np_sigmoid_grad(ya):
         return ya * (1 - ya)

     class TestSigmoid(HybridBlock):
         def __init__(self):
             super(TestSigmoid, self).__init__()

         def forward(self, a):
             return npx.sigmoid(a)

     shapes = [(), (2, 3, 4), (2, 0, 3), (1, 0, 0)]
     for hybridize in [True, False]:
         for shape in shapes:
             test_sigmoid = TestSigmoid()
             if hybridize:
                 test_sigmoid.hybridize()
             x = rand_ndarray(shape).as_np_ndarray()
             x.attach_grad()
             np_out = np_sigmoid(x.asnumpy())
             with mx.autograd.record():
                 mx_out = test_sigmoid(x)
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             mx_out.backward()
             np_backward = np_sigmoid_grad(np_out)
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

             mx_out = npx.sigmoid(x)
             np_out = np_sigmoid(x.asnumpy())
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_atleast_nd():
     class TestAtleastND(HybridBlock):
         def __init__(self, n):
             super(TestAtleastND, self).__init__()
             self._n = n

         def forward(self, *arys):
             if self._n == 1:
                 return np.atleast_1d(*arys)
             elif self._n == 2:
                 return np.atleast_2d(*arys)
             elif self._n == 3:
                 return np.atleast_3d(*arys)

     tensor_shapes = [
         ((), (2,), (3, 4, 5)),
         ((2, 3, 4, 5), (), (2, 3))
     ]
     flags = [True, False]
     ns = [1, 2, 3]
     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     funcs = {
         "numpy": {1: lambda *ts: onp.atleast_1d(*ts),
                   2: lambda *ts: onp.atleast_2d(*ts),
                   3: lambda *ts: onp.atleast_3d(*ts)},
         "mxnet": {1: lambda *ts: np.atleast_1d(*ts),
                   2: lambda *ts: np.atleast_2d(*ts),
                   3: lambda *ts: np.atleast_3d(*ts)}
     }
     for hybridize, n, tensor_shape, dtype in \
         itertools.product(flags, ns, tensor_shapes, dtypes):
         test_atleast_nd = TestAtleastND(n)
         if hybridize:
             test_atleast_nd.hybridize()
         if dtype in ['int32', 'int64']:
             tensors = list(map(lambda s: np.random.randint(-1, 1, size=s, dtype=dtype), tensor_shape))
         else:
             tensors = list(map(lambda s: np.random.uniform(-1.0, 1.0, size=s, dtype=dtype), tensor_shape))
         tensors_np = [t.asnumpy() for t in tensors]
         mx_out = test_atleast_nd(*tensors)
         np_out = funcs["numpy"][n](*tensors_np)
         for i in range(len(tensors)):
             assert mx_out[i].shape == np_out[i].shape
             assert same(mx_out[i].asnumpy(), np_out[i])

         mx_out = funcs["mxnet"][n](*tensors)
         np_out = funcs["numpy"][n](*tensors_np)
         for i in range(len(tensors)):
             assert mx_out[i].shape == np_out[i].shape
             assert same(mx_out[i].asnumpy(), np_out[i])


 @use_np
 def test_np_arange():
     configs = [
         (1, 10, 2),
         (1, 10, 4),
         (1, -10, 4),
         (1, -10, -2),
         (1, -10, -4),
         (2, 3),
         (2, -3),
         (-2, -3),
         (-2, 3),
         (4, 0, 5),
         (-4, 0, 5),
         (-4, 0, -5),
         (0, 0),
         (11, 11),
         (0, 0, 2),
         (0, 0, -2),
         (0, 5, None),
         (0, -5, None),
         0,
         6,
     ]
     dtypes = ['int32', 'float16', 'float32', 'float64', None]
     for config in configs:
         for dtype in dtypes:
             if isinstance(config, tuple):
                 mx_ret = np.arange(*config, dtype=dtype)
                 np_ret = onp.arange(*config, dtype=dtype)
             else:
                 mx_ret = np.arange(config, dtype=dtype)
                 np_ret = onp.arange(config, dtype=dtype)
             assert same(mx_ret.asnumpy(), np_ret)

     class TestRange(HybridBlock):
         def __init__(self, start, stop=None, step=None, dtype=None):
             super(TestRange, self).__init__()
             self._start = start
             self._stop = stop
             self._step = step
             self._dtype = dtype

         def forward(self, x):
             return x + np.arange(self._start, self._stop, self._step, dtype=self._dtype)

     for dtype in dtypes:
         x = np.zeros(shape=(), dtype=dtype)
         for config in configs:
             for hybridize in [False, True]:
                 if isinstance(config, tuple):
                     net = TestRange(*config, dtype=dtype)
                     np_out = onp.arange(*config, dtype=dtype)
                 else:
                     net = TestRange(config, dtype=dtype)
                     np_out = onp.arange(config, dtype=dtype)
                 if hybridize:
                     net.hybridize()
                 mx_out = net(x)
                 assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_np_insert():
     class TestInsert(HybridBlock):
         def __init__(self, obj, axis=None):
             super(TestInsert, self).__init__()
             self._obj = obj
             self._axis = axis

         def forward(self, a, b):
             return np.insert(a, self._obj, b, axis=self._axis)

     def GetSize(tp):
         res = 1
         for x in tp:
             res = res * x
         return res

     def GetNdim(tp):
         return len(tp)

     A = (3, 2)
     B = (2)
     C = (2, 2)
     D = (2, 3)
     E = (1)
     F = (3, 1)
     G = (3, 2)
     H = (2, 2, 3, 8)
     config = []
     # test scale index
     for idx in range(-1 * GetSize(A), GetSize(A) + 1):
         config.append(tuple([A, idx, B, None]))
         config.append(tuple([A, idx, E, None]))
         config.append(tuple([A, idx, 1, None]))
     for idx in range(-1 * A[0], A[0] + 1):
         config.append(tuple([A, idx, C, 0]))
         config.append(tuple([A, idx, E, 0]))
         config.append(tuple([A, idx, F, 0]))
         config.append(tuple([A, idx, 1, 0]))
     for idx in range(-1 * A[1], A[1] + 1):
         config.append(tuple([A, idx, D, 1]))
         config.append(tuple([A, idx, E, 1]))
         config.append(tuple([A, idx, F, 1]))
         config.append(tuple([A, idx, 1, 1]))
     # test tuple of indices with size = 1
     for idx in range(-1 * GetSize(A), GetSize(A) + 1):
         config.append(tuple([A, [idx], B, None]))
         config.append(tuple([A, [idx], E, None]))
         config.append(tuple([A, [idx], 1, None]))
     for idx in range(-1 * A[0], A[0] + 1):
         config.append(tuple([A, [idx], C, 0]))
         config.append(tuple([A, [idx], E, 0]))
         config.append(tuple([A, [idx], F, 0]))
         config.append(tuple([A, [idx], 1, 0]))
     for idx in range(-1 * A[1], A[1] + 1):
         config.append(tuple([A, [idx], G, 1]))
         config.append(tuple([A, [idx], E, 1]))
         config.append(tuple([A, [idx], F, 1]))
         config.append(tuple([A, [idx], 1, 1]))
     # test tuple of indices with size > 1
     for ax in range(-1 * GetNdim(A), GetNdim(A)):
         idx = onp.random.randint(-1 * A[ax], A[ax] + 1, size = (3)).tolist()
         config.append(tuple([A, idx, F, ax]))
         config.append(tuple([A, idx, 1, ax]))
         config.append(tuple([A, slice(0, 3), F, ax]))
         config.append(tuple([A, slice(0, 3), 1, ax]))
     # test multidimensional array and unequal dimensions case
     config.append(tuple([H, 0, D, 3]))
     config.append(tuple([H, 0, 1, 3]))
     config.append(tuple([H, [1], E, 2]))
     config.append(tuple([H, [1], 1, 2]))
     idx = onp.random.randint(-1 * H[3], H[3] + 1, size = (5)).tolist()
     config.append(tuple([H, idx, E, 3]))
     config.append(tuple([H, idx, 1, 3]))
     # test slice
     for st in [-5, -3, -1, 0, 1, 3, 5, None]:
         for ed in [-5, -3, -1, 0, 1, 3, 5, None]:
             for stp in [-1, 1, 2, None]:
                 config.append(tuple([A, slice(st, ed, stp), F, 1]))
     dtypes = ['int32', 'float16', 'float32', 'float64', None]

     for arr_shape, obj, val_shape, axis in config:
         for atype, btype in itertools.product(dtypes, dtypes):
             if type(obj) == list:
                 obj_mxnp = np.array(obj, dtype='int64')
                 obj_onp = onp.array(obj)
             elif type(obj) == slice:
                 obj_mxnp = obj
                 obj_onp = obj
             else:  # integer
                 obj_mxnp = obj
                 obj_onp = obj
             test_insert = TestInsert(obj=obj_mxnp, axis=axis)

             a = mx.nd.random.uniform(-10.0, 10.0, shape=arr_shape).as_np_ndarray().astype(atype)
             a.attach_grad()
             b = mx.nd.random.uniform(-10.0, 10.0, shape=val_shape).as_np_ndarray().astype(btype)
             b.attach_grad()
             expected_ret = onp.insert(a.asnumpy(), obj_onp, b.asnumpy(), axis=axis)
             with mx.autograd.record():
                 y = test_insert(a, b)

             assert y.shape == expected_ret.shape
             assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)

             #test imperative
             mx_out = np.insert(a, obj_mxnp, b, axis=axis)
             np_out = onp.insert(a.asnumpy(), obj_onp, b.asnumpy(), axis=axis)

             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_split():
     class TestSplit(HybridBlock):
         def __init__(self, indices_or_sections, axis=None):
             super(TestSplit, self).__init__()
             self._axis = axis
             self._indices_or_sections = indices_or_sections

         def forward(self, a, *args, **kwargs):
             return np.split(a, indices_or_sections=self._indices_or_sections,
                               axis=self._axis)

     def get_indices(axis_size):
         if axis_size is 0:
             axis_size = random.randint(3, 6)
         samples = random.randint(1, axis_size - 1)
         indices = sorted(random.sample([i for i in range(1, axis_size)], samples))
         indices = tuple(indices)
         return indices

     dim = random.randint(0, 3)
     shape = [0] + [random.randint(2, 4) for i in range(dim)]
     for hybridize in [True, False]:
         for axis in range(-len(shape)+1, len(shape)):
             indices = get_indices(shape[axis])
             sections = 7 if shape[axis] is 0 else shape[axis]
             for indices_or_sections in [indices, sections]:
                 # test gluon
                 test_split = TestSplit(axis=axis, indices_or_sections=indices_or_sections)
                 if hybridize:
                     test_split.hybridize()

                 a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray()
                 a.attach_grad()
                 expected_ret = onp.split(a.asnumpy(), indices_or_sections=indices_or_sections, axis=axis)
                 with mx.autograd.record():
                     y = test_split(a)
                 assert len(y) == len(expected_ret)
                 for mx_out, np_out in zip(y, expected_ret):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

                 mx.autograd.backward(y)

                 assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)

                 # test imperative
                 mx_outs = np.split(a, indices_or_sections=indices_or_sections, axis=axis)
                 np_outs = onp.split(a.asnumpy(), indices_or_sections=indices_or_sections, axis=axis)
                 for mx_out, np_out in zip(mx_outs, np_outs):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_array_split():
     class TestArray_split(HybridBlock):
         def __init__(self, indices_or_sections, axis=None):
             super(TestArray_split, self).__init__()
             self._axis = axis
             self._indices_or_sections = indices_or_sections

         def forward(self, a, *args, **kwargs):
             return np.array_split(a, indices_or_sections=self._indices_or_sections,
                               axis=self._axis)

     def get_indices(axis_size):
         if axis_size is 0:
             axis_size = random.randint(3, 6)
         samples = random.randint(1, axis_size - 1)
         indices = sorted(random.sample([i for i in range(0, axis_size + 1)], samples))
         indices = tuple(indices)
         return indices

     shapes = [(), (5, ), (10, ),
               (2, 5), (5, 5), (10, 10),
               (4, 4, 4), (4, 6, 9), (6, 6, 6),
               (7, 8, 9, 10)]
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]

     combinations = itertools.product([False, True], shapes, dtypes)
     for hybridize, shape, dtype in combinations:
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5
         for axis in range(len(shape)):
             x = np.random.uniform(-5.0, 5.0, size=shape).astype(dtype)
             indices = get_indices(shape[axis])
             sections = 7 if x.shape[axis] is 0 else random.randint(1,x.shape[axis])
             for indices_or_sections in [indices, sections]:
                 # test gluon
                 test_array_split = TestArray_split(axis=axis, indices_or_sections=indices_or_sections)
                 if hybridize:
                     test_array_split.hybridize()
                 x.attach_grad()
                 expected_ret = onp.array_split(x.asnumpy(), indices_or_sections=indices_or_sections, axis=axis)
                 with mx.autograd.record():
                     y = test_array_split(x)
                 assert len(y) == len(expected_ret)
                 for mx_out, np_out in zip(y, expected_ret):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
                 mx.autograd.backward(y)
                 assert_almost_equal(x.grad.asnumpy(), onp.ones(x.shape), rtol=rtol, atol=atol)

                 # test imperative
                 mx_outs = np.array_split(x, indices_or_sections=indices_or_sections, axis=axis)
                 np_outs = onp.array_split(x.asnumpy(), indices_or_sections=indices_or_sections, axis=axis)
                 for mx_out, np_out in zip(mx_outs, np_outs):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_vsplit():
     class TestVsplit(HybridBlock):
         def __init__(self, indices_or_sections):
             super(TestVsplit, self).__init__()
             self._indices_or_sections = indices_or_sections

         def forward(self, a, *args, **kwargs):
             return np.vsplit(a, indices_or_sections=self._indices_or_sections)

     def get_indices(axis_size):
         if axis_size is 0:
             axis_size = random.randint(3, 6)
         samples = random.randint(1, axis_size - 1)
         indices = sorted(random.sample([i for i in range(1, axis_size)], samples))
         indices = tuple(indices)
         return indices

     shapes = [
         (2, 1, 2, 9),
         (4, 3, 3),
         (4, 0, 2),  # zero-size shape
         (0, 3), # first dim being zero
     ]
     for hybridize in [True, False]:
         for shape in shapes:
             axis_size = shape[0]
             indices = get_indices(axis_size)
             sections = 7 if axis_size is 0 else axis_size
             for indices_or_sections in [indices, sections]:
                 # test gluon
                 test_vsplit = TestVsplit(indices_or_sections=indices_or_sections)
                 if hybridize:
                     test_vsplit.hybridize()
                 a = rand_ndarray(shape).as_np_ndarray() # TODO: check type
                 a.attach_grad()
                 expected_ret = onp.vsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 with mx.autograd.record():
                     y = test_vsplit(a)
                 assert len(y) == len(expected_ret)
                 for mx_out, np_out in zip(y, expected_ret):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

                 mx.autograd.backward(y)

                 assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)

                 # test imperative
                 mx_outs = np.vsplit(a, indices_or_sections=indices_or_sections)
                 np_outs = onp.vsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 for mx_out, np_out in zip(mx_outs, np_outs):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_concat():
     class TestConcat(HybridBlock):
         def __init__(self, axis=None):
             super(TestConcat, self).__init__()
             self._axis = axis

         def forward(self, a, *args):
             return np.concatenate([a] + list(args), axis=self._axis)

     def get_new_shape(shape, axis):
         shape_lst = list(shape)
         if axis is not None:
             shape_lst[axis] = random.randint(0, 3)
         return tuple(shape_lst)

     shapes = [(), (0, 0), (2, 3), (2, 1, 3)]
     hybridizes = [True, False]
     axes = [0, 1, -1, None]
     grad_reqs = ['write', 'add', 'null']
     dtypes = [np.float32, np.float64, np.bool]
     combinations = itertools.product(shapes, hybridizes, axes, grad_reqs, dtypes)

     for shape, hybridize, axis, grad_req, dtype in combinations:
         # test gluon
         if shape == () and axis != None:
             continue
         test_concat = TestConcat(axis=axis)
         if hybridize:
             test_concat.hybridize()

         grad_req_c = grad_req
         grad_req_d = grad_req
         if grad_req == 'null':
             ide = random.randint(0, 2)
             grad_req_c = 'write' if ide == 0 else 'add'
             grad_req_c = 'write' if ide == 1 else 'add'

         a = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
         a.attach_grad(grad_req)
         b = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
         b.attach_grad(grad_req)
         c = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
         c.attach_grad(grad_req_c)
         d = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
         d.attach_grad(grad_req_d)
         expected_ret = onp.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)

         with mx.autograd.record():
             y = test_concat(a, b, c, d)

         assert y.shape == expected_ret.shape
         assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)

         y.backward()
         if grad_req != 'null':
             assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)
         if grad_req != 'null':
             assert_almost_equal(b.grad.asnumpy(), onp.ones(b.shape), rtol=1e-3, atol=1e-5)
         if grad_req_c != 'null':
             assert_almost_equal(c.grad.asnumpy(), onp.ones(c.shape), rtol=1e-3, atol=1e-5)
         if grad_req_d != 'null':
             assert_almost_equal(d.grad.asnumpy(), onp.ones(d.shape), rtol=1e-3, atol=1e-5)

         # test imperative
         mx_out = np.concatenate([a, b, c, d], axis=axis)
         np_out = onp.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_append():
     class TestAppend(HybridBlock):
         def __init__(self, axis=None):
             super(TestAppend, self).__init__()
             self._axis = axis

         def forward(self, a, b):
             return np.append(a, b, axis=self._axis)

     def get_new_shape(shape, axis):
         shape_lst = list(shape)
         if axis is not None:
             shape_lst[axis] = random.randint(0, 3)
         return tuple(shape_lst)

     for shape in [(0, 0), (2, 3), (2, 1, 3)]:
         for hybridize in [True, False]:
             for axis in [0, 1, None]:
                 for grad_req_a in ['write', 'add', 'null']:
                     if grad_req_a == 'null':
                         continue
                     #set grad_req
                     grad_req_b = grad_req_a
                     if grad_req_a == 'null':
                         ide = random.randint(0, 2)
                         grad_req_b = 'write' if ide == 0 else 'add'

                     #test gluon
                     test_append = TestAppend(axis=axis)
                     if hybridize:
                         test_append.hybridize()

                     a = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
                     a.attach_grad(grad_req=grad_req_a)
                     b = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
                     b.attach_grad(grad_req=grad_req_b)
                     expected_ret = onp.append(a.asnumpy(), b.asnumpy(), axis=axis)

                     with mx.autograd.record():
                         y = test_append(a, b)

                     assert y.shape == expected_ret.shape
                     assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
                     y.backward()

                     if grad_req_a != 'null':
                         assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)
                     assert_almost_equal(b.grad.asnumpy(), onp.ones(b.shape), rtol=1e-3, atol=1e-5)
                     #test imperative
                     mx_out = np.append(a, b, axis=axis)
                     np_out = onp.append(a.asnumpy(), b.asnumpy(), axis=axis)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_stack():
     class TestStack(HybridBlock):
         def __init__(self, axis=None):
             super(TestStack, self).__init__()
             self._axis = axis

         def forward(self, a, *args):
             return np.stack([a] + list(args), axis=self._axis)

     a, b, c, d = mx.sym.Variable("a"), mx.sym.Variable("b"), mx.sym.Variable("c"), mx.sym.Variable("d")
     ret = mx.sym.np.stack([a.as_np_ndarray(), b.as_np_ndarray(), c.as_np_ndarray(), d.as_np_ndarray()])
     assert type(ret) == mx.sym.np._Symbol

     for shape in [(0, 0), (2, 3)]:
         for hybridize in [True, False]:
             for axis in range(2):
                 test_stack = TestStack(axis=axis)
                 if hybridize:
                     test_stack.hybridize()
                 np_a = onp.random.uniform(-1.0, 1.0, shape).astype(onp.float32)
                 np_b = onp.random.uniform(-1.0, 1.0, shape).astype(onp.float32)
                 np_c = onp.random.uniform(-1.0, 1.0, shape).astype(onp.float32)
                 np_d = onp.random.uniform(-1.0, 1.0, shape).astype(onp.float32)

                 mx_a = np.array(np_a)
                 mx_a.attach_grad()
                 mx_b = np.array(np_b)
                 mx_b.attach_grad()
                 mx_c = np.array(np_c)
                 mx_c.attach_grad()
                 mx_d = np.array(np_d)
                 mx_d.attach_grad()
                 expected_ret = onp.stack([np_a, np_b, np_c, np_d], axis=axis)
                 with mx.autograd.record():
                     y = test_stack(mx_a, mx_b, mx_c, mx_d)

                 y.backward()

                 assert_almost_equal(mx_a.grad.asnumpy(), onp.ones(shape), rtol=1e-3, atol=1e-5)
                 assert_almost_equal(mx_b.grad.asnumpy(), onp.ones(shape), rtol=1e-3, atol=1e-5)
                 assert_almost_equal(mx_c.grad.asnumpy(), onp.ones(shape), rtol=1e-3, atol=1e-5)
                 assert_almost_equal(mx_d.grad.asnumpy(), onp.ones(shape), rtol=1e-3, atol=1e-5)

                 np_out = onp.stack([np_a, np_b, np_c, np_d], axis=axis)
                 mx_out = np.stack([mx_a, mx_b, mx_c, mx_d], axis=axis)
                 assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_np_hstack():
     class TestHStack(HybridBlock):
         def __init__(self):
             super(TestHStack, self).__init__()

         def forward(self, a, *args):
             return np.hstack([a] + list(args))

     def get_new_shape(shape):
         if len(shape) == 0:
             l = random.randint(0,3)
             if l == 0:
                 return shape
             else:
                 return (l,)
         shape_lst = list(shape)
         axis = 1 if len(shape) > 1 else 0
         shape_lst[axis] = random.randint(0, 5)
         return tuple(shape_lst)

     shapes = [
         (),
         (1,),
         (2,1),
         (2,2,4),
         (2,0,0),
         (0,1,3),
         (2,0,3),
         (2,3,4,5)
     ]
     for hybridize in [True, False]:
         for shape in shapes:
             test_hstack = TestHStack()
             if hybridize:
                 test_hstack.hybridize()
             # test symbolic forward
             a = np.random.uniform(size=get_new_shape(shape))
             a.attach_grad()
             b = np.random.uniform(size=get_new_shape(shape))
             b.attach_grad()
             c = np.random.uniform(size=get_new_shape(shape))
             c.attach_grad()
             d = np.random.uniform(size=get_new_shape(shape))
             d.attach_grad()
             with mx.autograd.record():
                 mx_out = test_hstack(a, b, c, d)
             np_out = onp.hstack((a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()))
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

             # test symbolic backward
             mx_out.backward()
             assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(b.grad.asnumpy(), onp.ones(b.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(c.grad.asnumpy(), onp.ones(c.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(d.grad.asnumpy(), onp.ones(d.shape), rtol=1e-3, atol=1e-5)

             mx_out = np.hstack((a, b, c, d))
             np_out = onp.hstack((a.asnumpy(),b.asnumpy(), c.asnumpy(), d.asnumpy()))
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_dstack():
     class TestDStack(HybridBlock):
         def __init__(self):
             super(TestDStack, self).__init__()

         def forward(self, a, *args):
             return np.dstack([a] + list(args))

     def get_new_shape(shape):
         if len(shape) < 3:
             return shape
         axis = 2
         shape_lst = list(shape)
         shape_lst[axis] = random.randint(0, 5)
         return tuple(shape_lst)

     shapes = [
         (),
         (1,),
         (2,1),
         (2,2,4),
         (2,0,0),
         (0,1,3),
         (2,0,3),
         (2,3,4,5)
     ]
     for hybridize in [True, False]:
         for shape in shapes:
             test_dstack = TestDStack()
             if hybridize:
                 test_dstack.hybridize()
             # test symbolic forward
             a = mx.nd.random.uniform(shape=get_new_shape(shape)).as_np_ndarray()
             a.attach_grad()
             b = mx.nd.random.uniform(shape=get_new_shape(shape)).as_np_ndarray()
             b.attach_grad()
             c = mx.nd.random.uniform(shape=get_new_shape(shape)).as_np_ndarray()
             c.attach_grad()
             d = mx.nd.random.uniform(shape=get_new_shape(shape)).as_np_ndarray()
             d.attach_grad()
             with mx.autograd.record():
                 mx_out = test_dstack(a, b, c, d)
             np_out = onp.dstack((a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()))
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

             # test symbolic backward
             mx_out.backward()
             assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(b.grad.asnumpy(), onp.ones(b.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(c.grad.asnumpy(), onp.ones(c.shape), rtol=1e-3, atol=1e-5)
             assert_almost_equal(d.grad.asnumpy(), onp.ones(d.shape), rtol=1e-3, atol=1e-5)

             # test imperative
             mx_out = np.dstack((a, b, c, d))
             np_out = onp.dstack((a.asnumpy(),b.asnumpy(), c.asnumpy(), d.asnumpy()))
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_ravel():
     class TestRavel(HybridBlock):
         def __init__(self):
             super(TestRavel, self).__init__()

         def forward(self, a):
             return np.ravel(a)

     types = ['float64', 'float32', 'float16', 'int64', 'int32', 'int8']
     for oneType in types:
         for hybridize in [True, False]:
             for shape in [(), (2,), (2, 2), (1, 2, 3), (3, 0), (1, 0, 2)]:
                 test_ravel = TestRavel()
                 if hybridize:
                     test_ravel.hybridize()
                 x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                 x.attach_grad()
                 np_out = onp.ravel(x.asnumpy())
                 with mx.autograd.record():
                     mx_out = test_ravel(x)
                 assert mx_out.shape == np_out.shape
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                 mx_out.backward()
                 np_backward = onp.ones(shape)
                 assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

                 mx_out = np.ravel(x)
                 np_out = onp.ravel(x.asnumpy())
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_randint():
     device = mx.device.current_device()
     # test shapes
     params = [
         (0, 10),
         (5, None)
     ]
     shapes = [
         None,
         (),
         (3, 3),
         (3, 4),
         (0, 0),
         (3, 3, 3),
         (0, 0, 0),
         (2, 2, 4, 3),
         (2, 2, 4, 3),
         (2, 0, 3, 0),
         (2, 0, 2, 3)
     ]
     for shape in shapes:
         for (low, high) in params:
             data_mx = np.random.randint(low, high, size=shape)
             assert data_mx.shape == (shape if shape is not None else ())

     # test generator
     for dtype in ['int32', 'int64']:
         for low, high in [(50000000, 50001000),(-50000100,-50000000),(-500,199)]:
             scale = high - low
             buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.uniform.ppf(x, loc=low, scale=scale), 5)
             # Quantize bucket boundaries to reflect the actual dtype and adjust probs accordingly
             buckets = onp.array(buckets, dtype=dtype).tolist()
             probs = [(buckets[i][1] - buckets[i][0]) / float(scale) for i in range(5)]
             generator_mx = lambda x: np.random.randint(low, high, size=x, dtype=dtype, device=device).asnumpy()
             verify_generator(generator=generator_mx, buckets=buckets, probs=probs, nrepeat=100)
             # Scipy uses alpha = 0.01 for testing discrete distribution generator but we are using default alpha=0.05 (higher threshold ensures robustness)
             # Refer - https://github.com/scipy/scipy/blob/9f12af697763fb5f9767d5cb1280ce62456a3974/scipy/stats/tests/test_discrete_basic.py#L45
             generator_mx_same_seed = \
                 lambda x: onp.concatenate(
                     [np.random.randint(low, high, size=x // 10, dtype=dtype, device=device).asnumpy()
                         for _ in range(10)])
             verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, nrepeat=100)


 @use_np
 def test_np_swapaxes():
     config = [((0, 1, 2), 0, 0),
               ((0, 1, 2), 1, 2),
               ((0, 1, 2), 1, -2),
               ((4, 5, 6, 7), 1, 1),
               ((4, 5, 6, 7), 2, -2),
               ((4, 5, 6, 7), -2, -3)]

     class TestSwapaxes(HybridBlock):
         def __init__(self, axis1, axis2):
             super(TestSwapaxes, self).__init__()
             self._axis1 = axis1
             self._axis2 = axis2

         def forward(self, x):
             return np.swapaxes(x, self._axis1, self._axis2)

     for shape, axis1, axis2 in config:
         data_np = onp.random.uniform(size=shape)
         data_mx = np.array(data_np, dtype=data_np.dtype)
         ret_np = onp.swapaxes(data_np, axis1=axis1, axis2=axis2)
         ret_mx = np.swapaxes(data_mx, axis1=axis1, axis2=axis2)
         assert same(ret_mx.asnumpy(), ret_np)

         net = TestSwapaxes(axis1, axis2)
         for hybrid in [False, True]:
             if hybrid:
                 net.hybridize()
             ret_mx = net(data_mx)
             assert same(ret_mx.asnumpy(), ret_np)


 @use_np
 def test_np_delete():
     class TestDelete(HybridBlock):
         def __init__(self, obj, axis=None):
             super(TestDelete, self).__init__()
             self._obj = obj
             self._axis = axis

         def forward(self, a):
             return np.delete(a, self._obj, axis=self._axis)

     def GetSize(shp):
         if len(shp) == 0:
             return 0
         else:
             res = 1
             shp_list = list(shp)
             for x in shp:
                 res *= x
             return res

     def GetDimSize(shp, axis):
         if axis is None:
             return GetSize(shp)
         shp_list = list(shp)
         return shp_list[axis]

     shape = [(), (0, ), (1, ), (2, 3), (2, 1, 4, 5)]
     config = []
     for shp in shape:
         for ax in range(-1 * len(shp), len(shp), 2):
             #test slice
             for st in [-5, -2, 0, 2, 5, None]:
                 for ed in [-5, -2, 0, 2, 5, None]:
                     for stp in [-5, -2, 2, 5, None]:
                         config.append(tuple([shp, slice(st, ed, stp), None]))
                         config.append(tuple([shp, slice(st, ed, stp), ax]))
             #test iteger
             for idx in range(-1 * GetDimSize(shp, ax), GetDimSize(shp, ax)):
                 config.append(tuple([shp, idx, ax]))
             #test ndarray indices
             idx =  onp.random.randint(-1 * shp[ax], shp[ax] + 1, size = (4)).tolist()
             config.append(tuple([shp, idx, ax]))

     for arr_shape, obj, axis in config:
         for objtype in ['int32', 'int64']:
             if type(obj) == list:
                 obj_mxnp = np.array(obj, dtype=objtype)
                 obj_onp = onp.array(obj, dtype=objtype)
                 # To match mxnet.numpy's behavior of ignoring out-of-bounds indices,
                 # we may need to filter out indices that this numpy would not ignore.
                 onp_ignores_oob_indices = parse(onp.version.version) < parse('1.19')
                 if not onp_ignores_oob_indices:
                     dim_size = GetDimSize(arr_shape,axis)
                     obj_onp = obj_onp[((obj_onp>=0) & (obj_onp<dim_size))]
             elif type(obj) == slice:
                 obj_mxnp = obj
                 obj_onp = obj
             else:
                 obj_mxnp = (onp.int32(obj) if objtype == 'int32' else onp.int64(obj))
                 obj_onp = (onp.int32(obj) if objtype == 'int32' else onp.int64(obj))
             test_delete = TestDelete(obj=obj_mxnp, axis=axis)

             a = mx.nd.random.uniform(-1.0, 1.0, shape=arr_shape).as_np_ndarray()
             a.attach_grad()
             expected_ret = onp.delete(a.asnumpy(), obj_onp, axis=axis)

             with mx.autograd.record():
                 y = test_delete(a)

             assert y.shape == expected_ret.shape
             assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)

             #test imperative
             mx_out = np.delete(a, obj_mxnp, axis=axis)
             np_out = onp.delete(a.asnumpy(), obj_onp, axis=axis)

             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.parametrize('shape,axis,throw_exception', [
     ((), 0, False),
     ((), -1, False),
     ((), 1, True),
     ((5, 3), None, False),
     ((5, 3), -1, False),
     ((5, 3), 1, False),
     ((5, 3), 3, True),
     ((5, 0, 3), 0, False),
     ((5, 0, 3), -1, False),
     ((5, 0, 3), None, True),
     ((5, 0, 3), 1, True),
     ((3, 5, 7), None, False),
     ((3, 5, 7), 0, False),
     ((3, 5, 7), 1, False),
     ((3, 5, 7), 2, False),
     ((3, 5, 7, 9, 11), -3, False),
 ])
 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64', 'bool', 'int32'])
 @pytest.mark.parametrize('op_name', ['argmin', 'argmax'])
 @pytest.mark.parametrize('keepdims', [True, False])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_argmin_argmax(shape, axis, throw_exception, dtype, op_name, keepdims, hybridize):
     class TestArgExtreme(HybridBlock):
         def __init__(self, op_name, axis=None, keepdims=False):
             super(TestArgExtreme, self).__init__()
             self._op_name = op_name
             self._axis = axis
             self.keepdims = keepdims

         def forward(self, x):
             return getattr(x, self._op_name)(self._axis, keepdims=self.keepdims)

     a = np.random.uniform(low=0, high=100, size=shape).astype(dtype)
     if throw_exception:
         with pytest.raises(MXNetError):
             getattr(np, op_name)(a, axis)
             mx.npx.waitall()
     else:
         mx_ret = getattr(np, op_name)(a, axis=axis, keepdims=keepdims)
         np_ret = getattr(onp, op_name)(a.asnumpy(), axis=axis)
         assert mx_ret.dtype == np_ret.dtype
         if keepdims:
             assert same(np.squeeze(mx_ret, axis=axis).asnumpy(), np_ret)
         else:
             assert same(mx_ret.asnumpy(), np_ret)

     net = TestArgExtreme(op_name, axis, keepdims)
     if hybridize:
         net.hybridize()
     if throw_exception:
         with pytest.raises(MXNetError):
             getattr(np, op_name)(a, axis)
             mx.npx.waitall()
     else:
         mx_ret = net(a)
         assert mx_ret.dtype == np_ret.dtype
         if keepdims:
             assert same(np.squeeze(mx_ret, axis=axis).asnumpy(), np_ret)
         else:
             assert same(mx_ret.asnumpy(), np_ret)


 @use_np
 def test_np_argmin_argmax_large_tensor():
     # compare inp[arg] with ext directly because along one axis there might
     # be multiple extrema
     def single_run(op, dtype):
         inp = np.random.normal(0, 10, size=(200, 30000), dtype=dtype)
         arg = op[0](inp, 1)
         ref = op[1](inp, 1)
         for i, idx in enumerate(arg):
             assert inp[i, idx] == ref[i]

     dtypes = ['float16', 'float32', 'float64']
     ops = [(np.argmin, np.amin), (np.argmax, np.amax)]
     for o, d in zip(ops, dtypes):
         single_run(o, d)


 @use_np
 def test_np_clip():
     workloads = [
         ((), None, None, True),
         ((), None, 1, False),
         ((), -1, 1, False),
         ((), -1, None, False),
         ((5, 3), None, 0.1, False),
         ((5, 3), -0.1, None, False),
         ((5, 3), -0.1, 0.1, False),
         ((5, 3), 0, 0, False),
         ((5, 0, 3), 0, None, False),
         ((5, 0, 3), None, -1, False),
         ((5, 0, 3), -1, 0, False),
     ]
     dtypes = ['float32', 'float64']

     class TestClip(HybridBlock):
         def __init__(self, a_min=None, a_max=None):
             super(TestClip, self).__init__()
             self._a_min = a_min
             self._a_max = a_max

         def forward(self, x):
             return x.clip(self._a_min, self._a_max)

     # Test scalar case
     for _, a_min, a_max, throw_exception in workloads:
         a = onp.random.uniform() # A scalar
         if throw_exception:
             # No need to test the exception case here.
             continue
         mx_ret = np.clip(a, a_min, a_max)
         np_ret = onp.clip(a, a_min, a_max)
         assert_almost_equal(mx_ret, np_ret, atol=1e-4, rtol=1e-3, use_broadcast=False)

     for shape, a_min, a_max, throw_exception in workloads:
         for dtype in dtypes:
             a = np.random.uniform(size=shape, dtype=dtype)
             if throw_exception:
                 # Cannot use assert_exception because sometimes the main thread
                 # proceeds to `assert False` before the exception is thrown
                 # in the worker thread. Have to use mx.nd.waitall() here
                 # to block the main thread.
                 try:
                     a.clip(min=a_min, max=a_max)
                     mx.nd.waitall()
                     assert False
                 except:
                     pass
             else:
                 mx_ret = a.clip(min=a_min, max=a_max)
                 np_ret = a.asnumpy().clip(min=a_min, max=a_max)
                 assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-4, rtol=1e-3, use_broadcast=False)

             for hybridize in [False, True]:
                 net = TestClip(a_min, a_max)
                 if hybridize:
                     net.hybridize()
                 if throw_exception:
                     try:
                         net(a)
                         mx.nd.waitall()
                         assert False
                     except:
                         pass
                 else:
                     mx_ret = net(a)
                     assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-4, rtol=1e-3, use_broadcast=False)


 @use_np
 def test_npx_random_bernoulli():
     def _test_bernoulli_exception(prob, logit):
         output = npx.random.bernoulli(prob=prob, logit=logit).asnumpy()

     shapes = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
     dtypes = ['float16', 'float32', 'float64', 'int32', 'bool']
     for shape, dtype in itertools.product(shapes, dtypes):
         prob = np.random.uniform(size=shape)
         logit = np.log(prob) - np.log(1 - prob)
         expected_shape = shape
         if not isinstance(shape, tuple):
             expected_shape = () if shape is None else (shape,)
         out_prob = npx.random.bernoulli(prob=prob, size=shape, dtype=dtype)
         assert out_prob.shape == expected_shape
         assert int((out_prob.asnumpy() == 0).sum() + (out_prob.asnumpy() == 1).sum()) == out_prob.size
         out_logit = npx.random.bernoulli(logit=logit, size=shape, dtype=dtype)
         assert out_logit.shape == expected_shape
         assert int((out_logit.asnumpy() == 0).sum() + (out_logit.asnumpy() == 1).sum()) == out_logit.size
         # Test Exception.
         assertRaises(ValueError, _test_bernoulli_exception, prob, logit)
         if prob.size > 0:
             # larger than 1
             assertRaises(ValueError, _test_bernoulli_exception, prob + 2.0, None)
             # smaller than 0
             assertRaises(ValueError, _test_bernoulli_exception, prob - 2.0, None)
             # mixed case
             low, high = (-1.0, 2.0)
             # uniform(-1, 2)
             scaled_prob = low + (high - low) * prob
             if not ((scaled_prob.asnumpy() >= 0).all() and (scaled_prob.asnumpy() <= 1).all()):
                 assertRaises(ValueError, _test_bernoulli_exception, scaled_prob, None)


 @use_np
 def test_npx_constraint_check():
     msg = "condition violated"
     class TestConstraintViolatedCheck(HybridBlock):
         def __init__(self):
             super(TestConstraintViolatedCheck, self).__init__()

         def forward(self, boolean_tensor):
             return npx.constraint_check(boolean_tensor, msg)

     class TestConstraintNotViolatedCheck(HybridBlock):
         def __init__(self):
             super(TestConstraintNotViolatedCheck, self).__init__()

         def forward(self, input, boolean_tensor):
             return input * npx.constraint_check(boolean_tensor, msg)

     def raiseFunc(block):
         def executor(boolean_tensor):
             out = block(boolean_tensor).asnumpy()
         return executor

     shapes = [(1,), (2, 3), 6, (7, 8)]

     expect_success_output = np.array(True)
     for shape, hybridize in itertools.product(shapes, [True, False]):
         test_constraint = TestConstraintViolatedCheck()
         if hybridize:
             test_constraint.hybridize()
         assertRaises(ValueError, raiseFunc(test_constraint), np.zeros(shape, dtype='bool'))

     for shape, hybridize in itertools.product(shapes, [True, False]):
         test_constraint = TestConstraintNotViolatedCheck()
         if hybridize:
             test_constraint.hybridize()
         input_tensor = np.random.normal(size=shape)
         out = test_constraint(input_tensor, np.ones(shape, dtype='bool'))
         assert (input_tensor.asnumpy() == out.asnumpy()).all()


 @use_np
 def test_npx_special_unary_func():
     def check_unary_func(func, ref_grad, shape, low, high):
         class TestUnary(HybridBlock):
             def __init__(self, func):
                 super(TestUnary, self).__init__()
                 self._func = func

             def forward(self, a, *args, **kwargs):
                 return getattr(npx, self._func)(a)

         np_func = getattr(scipy_special, func)
         mx_func = TestUnary(func)
         np_test_data = onp.random.uniform(low, high, shape).astype(onp.float32)
         mx_test_data = mx.numpy.array(np_test_data)
         for hybridize in [True, False]:
             if hybridize:
                 mx_func.hybridize()
             if ref_grad:
                 mx_test_data.attach_grad()
             np_out = np_func(np_test_data)
             with mx.autograd.record():
                 y = mx_func(mx_test_data)
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
             if np_out.dtype == np.bool_:
                 assert y.dtype == np.bool_

             if ref_grad:
                 y.backward()
                 assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True)

         np_out = getattr(scipy_special, func)(np_test_data)
         mx_out = getattr(mx.npx, func)(mx_test_data)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     import math
     funcs = {
         'erf' : (lambda x: 2.0 / math.sqrt(math.pi) * onp.exp(-(x ** 2)), 0.5, 0.5),
         'erfinv' : (lambda x: 0.5 * math.sqrt(math.pi) * onp.exp(scipy_special.erfinv(x) ** 2), 0.5, 0.5),
         'gamma' : (lambda x: scipy_special.gamma(x) * scipy_special.psi(x), 0.5, 0.5),
         'gammaln' : (lambda x: scipy_special.psi(x), 0.5, 0.5),
         'digamma' : (lambda x: scipy_special.polygamma(1, x), 0.5, 0.5)
     }
     ndim = random.choice([2, 3, 4])
     shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)])
     for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]:
         for func, func_data in funcs.items():
             ref_grad, low, high = func_data
             check_unary_func(func, ref_grad, shape, low, high)


 @xfail_when_nonstandard_decimal_separator
 @use_np
 def test_np_random_grad():
     class TestRandomGrad(HybridBlock):
         def __init__(self, shape, op_name):
             super(TestRandomGrad, self).__init__()
             self._shape = shape
             self._dist_name = op_name
         def forward(self, loc, scale):
             op = getattr(np.random, self._dist_name, None)
             assert op is not None
             return op(loc=loc, scale=scale, size=self._shape)

     param_shape = [
         [(3, 2), (3, 2)],
         [(3, 2, 2), (3, 2, 2)],
         [(3, 4, 5), (4, 1)],
     ]
     output_shapes = [
         (3, 2),
         (4, 3, 2, 2),
         (3, 4, 5)
     ]
     op_names = ["normal", "logistic", "gumbel"]
     for op_name in op_names:
         for hybridize in [False, True]:
             for ((shape1, shape2), out_shape) in zip(param_shape, output_shapes):
                 test_random_grad = TestRandomGrad(out_shape, op_name)
                 if hybridize:
                     test_random_grad.hybridize()
                 loc = np.zeros(shape1)
                 loc.attach_grad()
                 scale = np.ones(shape2)
                 scale.attach_grad()
                 with mx.autograd.record():
                     samples = test_random_grad(loc, scale)
                 samples.backward()
                 assert loc.grad.shape == shape1
                 assert scale.grad.shape == shape2
                 assert_almost_equal(loc.grad.asnumpy().sum(), onp.ones(out_shape).sum(), rtol=1e-3, atol=1e-5)

         for (loc, scale) in [(2, (2,3)), ((2,3), 2), ((2,3), (2,3))]:
             if isinstance(loc, tuple):
                 loc = np.ones(loc)
             if isinstance(scale, tuple):
                 scale = np.ones(scale)
             mx_out = getattr(np.random, op_name)(loc, scale)
             np_out = getattr(onp.random, op_name)(loc, scale)
             assert mx_out.asnumpy().shape == np_out.shape


 @use_np
 def test_np_lognormal_grad():
     class TestLognormalGrad(HybridBlock):
         def __init__(self, shape):
             super(TestLognormalGrad, self).__init__()
             self._shape = shape

         def forward(self, mean, sigma):
             return np.random.lognormal(mean, sigma, self._shape)

     param_shape = [
         [(3, 2), (3, 2)],
         [(3, 2, 2), (3, 2, 2)],
         [(3, 4, 5), (4, 1)],
     ]
     output_shapes = [
         (3, 2),
         (4, 3, 2, 2),
         (3, 4, 5)
     ]
     for hybridize in [False, True]:
         for ((shape1, shape2), out_shape) in zip(param_shape, output_shapes):
             test_lognormal_grad = TestLognormalGrad(out_shape)
             if hybridize:
                 test_lognormal_grad.hybridize()
             mean = np.zeros(shape1)
             mean.attach_grad()
             sigma = np.ones(shape2)
             sigma.attach_grad()
             with mx.autograd.record():
                 mx_out = test_lognormal_grad(mean, sigma)
             np_out = onp.random.lognormal(mean = mean.asnumpy(),
                                             sigma = sigma.asnumpy(), size = out_shape)
             assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert mean.grad.shape == shape1
             assert sigma.grad.shape == shape2
             assert_almost_equal(mean.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)

     for ((shape1, shape2), out_shape) in zip(param_shape, output_shapes):
         mx_out = np.random.lognormal(np.zeros(shape1), np.ones(shape2), out_shape)
         np_out = onp.random.lognormal(np.zeros(shape1).asnumpy(), np.ones(shape2).asnumpy(), out_shape)
         assert mx_out.asnumpy().shape == np_out.shape

     def _test_lognormal_exception(sigma):
         output = np.random.lognormal(sigma=sigma).asnumpy()
     assertRaises(ValueError, _test_lognormal_exception, -1)


 @use_np
 def test_npx_sample_n():
     def shape_formatter(s):
         if s is None:
             return ()
         if isinstance(s, tuple):
             return s
         # scalar case
         return (s,)

     class TestSampleN(HybridBlock):
         def __init__(self, shape, op_name, dtype):
             super(TestSampleN, self).__init__()
             self._shape = shape
             self._op_name = op_name
             self._dtype = dtype

         def forward(self, param1, param2):
             op = getattr(npx.random, self._op_name, None)
             assert op is not None
             return op(param1, param2, batch_shape=self._shape, dtype=self._dtype)

     batch_shapes = [(10,), (2, 3), 6, ()]
     event_shapes = [(), (2,), (2,2)]
     dtypes = ['float16', 'float32', 'float64']
     op_names = ['uniform_n', 'normal_n']

     for bshape, eshape, dtype, op in itertools.product(batch_shapes, event_shapes, dtypes, op_names):
         for hybridize in [True, False]:
             net = TestSampleN(bshape, op, dtype)
             if hybridize:
                 net.hybridize()
             expected_shape = (shape_formatter(bshape) +
                               shape_formatter(eshape))
             out = net(np.ones(shape=eshape), np.ones(shape=eshape))
             assert out.shape == expected_shape


 @use_np
 def test_np_random():
     shapes = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
     dtypes = ['float16', 'float32', 'float64']
     op_names = ['uniform', 'normal', 'gamma', 'laplace']
     for shape in shapes:
         for dtype in dtypes:
             for op_name in op_names:
                 op = getattr(np.random, op_name, None)
                 assert op is not None
                 if op_name == 'gamma':
                     out = op(1, size=shape, dtype=dtype)
                 else:
                     out = op(size=shape, dtype=dtype)
                 expected_shape = shape
                 if not isinstance(shape, tuple):
                     expected_shape = () if shape is None else (shape,)
                 assert out.shape == expected_shape

     class TestRandom(HybridBlock):
         def __init__(self, shape, op_name, param=None):
             super(TestRandom, self).__init__()
             self._shape = shape
             self._op_name = op_name
             # In case parameters are not optional
             self._param = param

         def forward(self, x):
             op = getattr(np.random, self._op_name, None)
             assert op is not None
             if self._param is not None:
                 return x + op(self._param, size=self._shape)
             return x + op(size=self._shape)

     x = np.ones(())
     for op_name in op_names:
         for shape in shapes:
             for hybridize in [False, True]:
                 if op_name == "gamma":
                     net = TestRandom(shape, op_name, 1)
                 else:
                     net = TestRandom(shape, op_name)
                 if hybridize:
                     net.hybridize()
                 out = net(x)
                 expected_shape = shape
                 if not isinstance(shape, tuple):
                     expected_shape = () if shape is None else (shape,)
                 assert out.shape == expected_shape


 @use_np
 def test_gamma_exception():
     def _test_gamma_exception(shape, scale):
         return np.random.gamma(shape, scale).asnumpy()

     shape_list = [
         1,
         np.array(1),
         np.array(1),
         0,
         0,
         np.array(0)
     ]
     scale_list = [
         0,
         0,
         np.array(-1.0),
         1,
         np.array(1),
         np.array(1)
     ]
     for (shape, scale) in zip(shape_list, scale_list):
         assertRaises(ValueError, _test_gamma_exception, shape, scale)


 @use_np
 @pytest.mark.parametrize("shape", [(1,), (2, 2), (4, 2, 2)])
 @pytest.mark.parametrize("a", [2.0, 5.0, 10.0])
 @pytest.mark.parametrize("b", [0.5, 1.0, 1.5])
 def test_gamma_grad(shape, a, b):
     class TestGammaGrad(HybridBlock):
         def __init__(self, size, beta):
             super(TestGammaGrad, self).__init__()
             self._size = size
             self._beta = beta

         def forward(self, a):
             return np.random.gamma(a, self._beta, size=self._size)

     for hybridize in [True, False]:
         param = np.ones(shape) * a
         param.attach_grad()
         net = TestGammaGrad(shape, b)
         if hybridize:
             net.hybridize()
         with mx.autograd.record():
             samples = net(param)
         samples.backward()
         # Check shape
         assert param.grad.shape == param.shape
         # Check correctness
         cdf = ss.gamma.cdf
         log_pdf = ss.gamma.logpdf
         eps = (0.01 * param / (1.0 + param ** 0.5)).asnumpy()
         x = samples.asnumpy().astype('float64') / b
         # d(cdf(x;alpha,beta))/d(alpha)
         cdf_alpha = (cdf(x, param.asnumpy() + eps) -
                         cdf(x, param.asnumpy() - eps)) / (2 * eps)
         # d(cdf(x;alpha,beta))/d(x)
         log_cdf_x = log_pdf(x, param.asnumpy())
         expected_grad = -b * cdf_alpha / onp.exp(log_cdf_x)
         assert_almost_equal(expected_grad, param.grad.asnumpy(), rtol=1e-2, atol=1e-3)


 @use_np
 def test_np_random_beta():
     class TestRandomBeta(HybridBlock):
         def __init__(self, size=None, dtype=None, device=None):
             super(TestRandomBeta, self).__init__()
             self._size = size
             self._dtype = dtype
             self._device = device

         def forward(self, a, b):
             return np.random.beta(a, b, size=self._size, dtype=self._dtype, device=self._device)

     def _test_random_beta_range(output):
         bigger_than_zero = onp.all(output > 0)
         smaller_than_one = onp.all(output < 1)
         return bigger_than_zero and smaller_than_one

     # Starting with numpy 1.19.0, output shape of () is no longer supported
     shape_list = [(0,), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
     # since fp16 might incur precision issue, the corresponding test is skipped
     dtype_list = [np.float32, np.float64]
     hybridize_list = [False, True]
     data = np.array([1])
     for [param_shape, in_dtype, out_dtype, hybridize] in itertools.product(shape_list,
             dtype_list, dtype_list, hybridize_list):
         mx_data = data.astype(in_dtype)
         np_data = mx_data.asnumpy()
         test_random_beta = TestRandomBeta(size=param_shape, dtype=out_dtype)
         if hybridize:
             test_random_beta.hybridize()
         np_out = onp.random.beta(np_data, np_data, size=param_shape)
         mx_out = test_random_beta(mx_data, mx_data)
         mx_out_imperative = mx.np.random.beta(mx_data, mx_data, size=param_shape, dtype=out_dtype)

         assert np_out.shape == mx_out.shape
         assert np_out.shape == mx_out_imperative.shape
         assert _test_random_beta_range(mx_out.asnumpy()) == True
         assert _test_random_beta_range(mx_out_imperative.asnumpy()) == True

         # test scalar
         mx_out_imperative = mx.np.random.beta(1, 1, size=param_shape, dtype=out_dtype)
         assert _test_random_beta_range(mx_out_imperative.asnumpy()) == True


 @use_np
 def test_np_random_f():
     class TestRandomF(HybridBlock):
         def __init__(self, size=None):
             super(TestRandomF, self).__init__()
             self._size = size

         def forward(self, dfnum, dfden):
             return np.random.f(dfnum, dfden, size=self._size)

     # Starting with numpy 1.19.0, output shape of () is no longer supported
     shape_list = [(0,), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
     hybridize_list = [False, True]
     df = np.array([1])
     for [param_shape, hybridize] in itertools.product(shape_list,
          hybridize_list):
         if sys.version_info.major < 3 and param_shape == ():
             continue
         mx_df = df
         np_df = mx_df.asnumpy()
         test_random_f = TestRandomF(size=param_shape)
         if hybridize:
             test_random_f.hybridize()
         np_out = onp.random.f(np_df, np_df, size=param_shape)
         mx_out = test_random_f(mx_df, mx_df)
         mx_out_imperative = mx.np.random.f(mx_df, mx_df, size=param_shape)

         assert np_out.shape == mx_out.shape
         assert np_out.shape == mx_out_imperative.shape


 @use_np
 def test_np_random_chisquare():
     class TestRandomChisquare(HybridBlock):
         def __init__(self, size=None, dtype=None, device=None):
             super(TestRandomChisquare, self).__init__()
             self._size = size
             self._dtype = dtype
             self._device = device

         def forward(self, df):
             return np.random.chisquare(df, size=self._size, dtype=self._dtype, device=self._device)

     # Starting with numpy 1.19.0, output shape of () is no longer supported
     shape_list = [(0,), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]

     dtype_list = [np.float16, np.float32, np.float64]
     hybridize_list = [False, True]
     df = np.array([1])
     for [param_shape, in_dtype, out_dtype, hybridize] in itertools.product(shape_list,
             dtype_list, dtype_list, hybridize_list):
         if sys.version_info.major < 3 and param_shape == ():
             continue
         mx_df = df.astype(in_dtype)
         np_df = mx_df.asnumpy()
         test_random_chisquare = TestRandomChisquare(size=param_shape, dtype=out_dtype)
         if hybridize:
             test_random_chisquare.hybridize()
         np_out = onp.random.chisquare(np_df, size=param_shape)
         mx_out = test_random_chisquare(mx_df)
         mx_out_imperative = mx.np.random.chisquare(mx_df, size=param_shape, dtype=out_dtype)

         assert np_out.shape == mx_out.shape
         assert np_out.shape == mx_out_imperative.shape


 @use_np
 def test_np_random_rayleigh():
     class TestRayleigh(HybridBlock):
         def __init__(self, shape):
             super(TestRayleigh, self).__init__()
             self._shape = shape

         def forward(self, scale):
             return np.random.rayleigh(scale, self._shape)

     shapes = [(2, 3), (4, 0, 5), (7, 8)]
     for hybridize in [False, True]:
         for shape in shapes:
             test_rayleigh = TestRayleigh(shape)
             if hybridize:
                 test_rayleigh.hybridize()

             scale = np.ones(shape)
             scale.attach_grad()
             with mx.autograd.record():
                 mx_out = test_rayleigh(scale)
             np_out = onp.random.rayleigh(scale = scale.asnumpy(), size = shape)
             assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)

     for shape in shapes:
         mx_out = np.random.rayleigh(np.array([1]), shape)
         np_out = onp.random.rayleigh(np.array([1]).asnumpy(), shape)
         assert mx_out.asnumpy().shape == np_out.shape

     def _test_rayleigh_exception(scale):
         output = np.random.rayleigh(scale=scale).asnumpy()
     assertRaises(ValueError, _test_rayleigh_exception, -1)


 @use_np
 def test_np_exponential():
     class TestRandomExp(HybridBlock):
         def __init__(self, shape):
             super(TestRandomExp, self).__init__()
             self._shape = shape

         def forward(self, scale):
             return np.random.exponential(scale, self._shape)

     output_shapes = [
         (3, 2),
         (4, 3, 2, 2),
         (3, 4, 5)
     ]
     for hybridize in [False, True]:
         for out_shape in output_shapes:
             test_exponential_grad = TestRandomExp(out_shape)
             if hybridize:
                 test_exponential_grad.hybridize()
             scale = np.ones(out_shape)
             scale.attach_grad()
             with mx.autograd.record():
                 mx_out = test_exponential_grad(scale)
             np_out = onp.random.exponential(scale = scale.asnumpy(), size = out_shape)
             assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == out_shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)

     def _test_exponential_exception(scale):
         output = np.random.exponential(scale=scale).asnumpy()
     assertRaises(ValueError, _test_exponential_exception, -1)


 @use_np
 def test_np_random_a():
     op_names = ['pareto', 'power', 'weibull']
     # these distributions have one required parameter a
     shapes = [(1,), (2, 3), (4, 0, 5), 6, (7, 8), (), None]

     def _test_random_x_range(output):
         ge_zero = onp.all(output >= 0)
         smaller_equal_one = onp.all(output <= 1)
         return ge_zero and smaller_equal_one

     # test imperative size shapes
     for [shape, op_name] in itertools.product(shapes, op_names):
         op = getattr(np.random, op_name, None)
         assert op is not None
         out = op(1.0, size=shape)
         expected_shape = shape
         if not isinstance(shape, tuple):
             expected_shape = () if shape is None else (shape,)
         assert out.shape == expected_shape
         # test range of generated values for power distribution
         if op_name == 'power':
             assert _test_random_x_range(out.asnumpy()) == True

     # test symbolic/hybridized size shapes
     class TestRandomA(HybridBlock):
         def __init__(self, shape, op_name):
             super(TestRandomA, self).__init__()
             self._shape = shape
             self._op_name = op_name

         def forward(self, a):
             op = getattr(np.random, self._op_name, None)
             assert op is not None
             return op(a, size=self._shape)

     hybridize = [False, True]
     for [op_name, shape, hybridize] in itertools.product(op_names, shapes, hybridize):
         test_op = TestRandomA(shape, op_name)
         if hybridize:
             test_op.hybridize()
         mx_out = test_op(np.array(1.0))
         expected_shape = shape
         if not isinstance(shape, tuple):
             expected_shape = () if shape is None else (shape,)
         assert mx_out.shape == expected_shape

     # test broadcasting of required parameter a shape when a is array-like
     ashapes = [(1,), (2, 3), (4, 0, 5), 6, (7, 8)]
     for shape in ashapes:
         a = np.ones(shape)
         for op_name in op_names:
             op = getattr(np.random, op_name, None)
             assert op is not None
             mx_out = op(a, size=None)
             expected_shape = a.shape
             assert mx_out.shape == expected_shape

     # test illegal parameter values
     def _test_exception(a):
         output = op(a=a).asnumpy()
     for op in op_names:
         op = getattr(np.random, op_name, None)
         if op is not None:
             assertRaises(ValueError, _test_exception, -1)
             assertRaises(ValueError, _test_exception, 0)


 @use_np
 def test_np_weibull_grad():
     class TestRandomW(HybridBlock):
         def __init__(self, shape):
             super(TestRandomW, self).__init__()
             self._shape = shape

         def forward(self, a):
             return np.random.weibull(a, self._shape)

     output_shapes = [
         (3, 2),
         (4, 3, 2, 2),
         (3, 4, 5)
     ]
     for hybridize in [False, True]:
         for out_shape in output_shapes:
             test_w_grad = TestRandomW(out_shape)
             if hybridize:
                 test_w_grad.hybridize()
             a = np.ones(out_shape)
             a.attach_grad()
             with mx.autograd.record():
                 mx_out = test_w_grad(a)
             mx_out.backward()

             # gradient formula calculus (a=1)
             formula_grad = - mx_out * np.log(mx_out)
             assert a.grad.shape == out_shape
             assert_almost_equal(a.grad.asnumpy().sum(), formula_grad.asnumpy().sum(), rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_pareto_grad():
     class TestRandomP(HybridBlock):
         def __init__(self, shape):
             super(TestRandomP, self).__init__()
             self._shape = shape

         def forward(self, a):
             return np.random.pareto(a, self._shape)

     output_shapes = [
         (3, 2),
         (4, 3, 2, 2),
         (3, 4, 5)
     ]
     for hybridize in [False, True]:
         for out_shape in output_shapes:
             test_w_grad = TestRandomP(out_shape)
             if hybridize:
                 test_w_grad.hybridize()
             a = np.ones(out_shape)
             a.attach_grad()
             with mx.autograd.record():
                 mx_out = test_w_grad(a)
             mx_out.backward()

             # gradient formula from calculus (a=1)
             noise = np.log(mx_out + np.ones(mx_out.shape))
             formula_grad = - (mx_out + np.ones(mx_out.shape)) * noise
             assert a.grad.shape == out_shape
             assert_almost_equal(a.grad.asnumpy().sum(), formula_grad.asnumpy().sum(), rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_randn():
     # Test shapes.
     shapes = [
         (3, 3),
         (3, 4),
         (0, 0),
         (3, 3, 3),
         (0, 0, 0),
         (2, 2, 4, 3),
         (2, 2, 4, 3),
         (2, 0, 3, 0),
         (2, 0, 2, 3)
     ]
     dtypes = ['float16', 'float32', 'float64']
     for dtype in dtypes:
         for shape in shapes:
             data_mx = np.random.randn(*shape, dtype=dtype)
             assert data_mx.shape == shape


 @use_np
 @pytest.mark.skip(reason='Test hangs. Tracked in #18144')
 def test_np_multivariate_normal():
     class TestMultivariateNormal(HybridBlock):
         def __init__(self, size=None):
             super(TestMultivariateNormal, self).__init__()
             self.size = size

         def forward(self, mean, cov):
             return np.random.multivariate_normal(mean, cov, self.size)

     hybridize_list = [True, False]
     dtypes = ['float16', 'float32', 'float64']
     size_list = [None, 1, (), (2, 3), (2, 0)]
     # [mean_shape, cov_shape]: onp.broadcast(mean_shape, cov_shape[:-1]) should not raise error
     batch_shape_list = [[(2,), (2, 2)], [(3, 2), (2, 2)], [(2,), (3, 2, 2)], [(3, 2), (4, 3, 2, 2)]]
     # most basic case for mean and cov
     mean = np.array([0.123456789, 10])
     cov = np.array([[1, 0], [0, 10]])

     for [hybridize, dtype, size, batch_shape] in itertools.product(hybridize_list,\
                 dtypes, size_list, batch_shape_list):
         # simplest case: 1-d, 0 batch
         # compared with official numpy
         mean_shape = batch_shape[0]
         cov_shape = batch_shape[1]
         new_mean = np.broadcast_to(mean, mean_shape).astype(dtype)
         new_cov = np.broadcast_to(cov, cov_shape).astype(dtype)

         test_multivariate_normal = TestMultivariateNormal(size)
         if hybridize:
             test_multivariate_normal.hybridize()

         test_shape = test_multivariate_normal(new_mean, new_cov).shape
         actual_shape = np.random.multivariate_normal(new_mean, new_cov, size).shape

         desired_shape = np.broadcast_arrays(np.empty(mean_shape), np.empty(cov_shape[:-1]))[0].shape

         if size is not None:
             size = [size] if isinstance(size, int) else list(size)
             desired_shape = size + list(desired_shape)

         assert list(desired_shape) == list(test_shape)
         assert list(desired_shape) == list(actual_shape)


 @use_np
 def test_npx_categorical():
     class TestNumpyCategorical(HybridBlock):
         def __init__(self, size=None):
             super(TestNumpyCategorical, self).__init__()
             self.size = size

         def forward(self, prob):
             if self.size is None:
                 return npx.random.categorical(prob)
             return npx.random.categorical(prob, shape=self.size)

     batch_sizes = [(2,), (2, 3)]
     event_shapes = [None, (10,), (10, 12)]
     num_event = [2, 4, 10]
     for batch_size, num_event, event_shape in itertools.product(batch_sizes, num_event, event_shapes):
         for hybridize in [True, False]:
             prob = np.ones(batch_size + (num_event,)) / num_event
             net = TestNumpyCategorical(event_shape)
             if hybridize:
                 net.hybridize()
             mx_out = net(prob)
             desired_shape = batch_size + event_shape if event_shape is not None else batch_size
             assert mx_out.shape == desired_shape


 @use_np
 def test_npx_multinomial():
     class TestNumpyMultinomial(HybridBlock):
         def __init__(self, size=None):
             super(TestNumpyMultinomial, self).__init__()
             self.size = size

         def forward(self, n, prob):
             if self.size is None:
                 return npx.random.multinomial(n, prob)
             return npx.random.multinomial(n, prob, shape=self.size)

     batch_sizes = [(2,), (2, 3)]
     event_shapes = [None, (10,), (10, 12)]
     num_event = [2, 4, 10]
     for batch_size, num_event, event_shape in itertools.product(batch_sizes, num_event, event_shapes):
         for hybridize in [True, False]:
             n = np.ones(batch_size)
             prob = np.ones(batch_size + (num_event,)) / num_event
             net = TestNumpyMultinomial(event_shape)
             if hybridize:
                 net.hybridize()
             mx_out = net(n, prob)
             desired_shape = batch_size + event_shape + (num_event,) if event_shape is not None else batch_size + (num_event,)
             assert mx_out.shape == desired_shape


 @use_np
 def test_random_seed():
     for seed in [234, 594, 7240, 20394]:
         ret = []
         for _ in range(2):
             npx.random.seed(seed=seed)
             ret.append(np.random.uniform(size=(2, 3)))
         assert_almost_equal(ret[0].asnumpy(), ret[1].asnumpy(), rtol=1e-4, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_cumsum():
     def np_cumsum_backward(ograd, axis=None, dtype=None):
         return onp.flip(onp.cumsum(onp.flip(ograd, axis=axis), axis=axis, dtype=dtype), axis=axis)

     class TestCumsum(HybridBlock):
         def __init__(self, axis=None, dtype=None):
             super(TestCumsum, self).__init__()
             self._axis = axis
             self._dtype = dtype

         def forward(self, a):
             return a.cumsum(axis=self._axis, dtype=self._dtype)

     shapes = [(2, 3, 4), (2, 0, 3), ()]
     for hybridize in [True, False]:
         for shape in shapes:
             for axis in [None] + [i for i in range(0, len(shape))]:
                 for otype in [None, onp.float32, onp.float64]:
                     test_cumsum = TestCumsum(axis=axis, dtype=otype)
                     if hybridize:
                         test_cumsum.hybridize()
                     for itype in [onp.float16, onp.float32, onp.float64]:
                         x = rand_ndarray(shape).astype(itype).as_np_ndarray()
                         x.attach_grad()
                         np_out = onp.cumsum(x.asnumpy(), axis=axis, dtype=otype)
                         with mx.autograd.record():
                             mx_out = test_cumsum(x)
                         assert mx_out.shape == np_out.shape
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                         mx_out.backward()
                         np_backward = np_cumsum_backward(onp.ones(np_out.shape, dtype=otype),
                                                          axis=axis, dtype=otype).reshape(x.shape)
                         assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5)

                         mx_out = np.cumsum(x, axis=axis, dtype=otype)
                         np_out = onp.cumsum(x.asnumpy(), axis=axis, dtype=otype)
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     for shape in shapes:
         for axis in [None] + [i for i in range(0, len(shape))]:
             for otype in [None, onp.int32, onp.int64]:
                 for itype in [onp.bool, onp.int8, onp.int32, onp.int64]:
                     x = rand_ndarray(shape).astype(itype).as_np_ndarray()
                     np_out = onp.cumsum(x.asnumpy(), axis=axis, dtype=otype)
                     mx_out = np.cumsum(x, axis=axis, dtype=otype)
                     assert mx_out.shape == np_out.shape
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.skip(reason='Skipped as the test is flaky and the feature causes curand error. Tracked in #18100')
 def test_np_histogram():
     shapes = [(), (3, 4), (3, 0)]

     for shape in shapes:
         mx_a = np.random.uniform(0.0, 10.0, size=shape)
         np_a = mx_a.asnumpy()
         mx_bins = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5., 6., 7., 8., 9., 10.])
         np_bins = mx_bins.asnumpy()
         for bins, _range in [(20, (0.0, 10.0)), (mx_bins, None)]:
             mx_cnts, mx_bins = np.histogram(mx_a, bins=bins, range=_range)
             np_cnts, np_bins = onp.histogram(np_a, bins=bins if isinstance(bins, mx.base.numeric_types) else bins.asnumpy(), range=_range)
             assert_almost_equal(mx_cnts.asnumpy(), np_cnts, rtol=1e-3, atol=1e-5)
             assert_almost_equal(mx_bins.asnumpy(), np_bins, rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.skip(reason='Skipped as the test is flaky and the feature causes curand error. Tracked in #18100')
 def test_np_choice():
     class TestUniformChoice(HybridBlock):
         def __init__(self, sample_size, replace):
             super(TestUniformChoice, self).__init__()
             self.sample_size = sample_size
             self.replace = replace

         def forward(self, a):
             return np.random.choice(a=a, size=self.sample_size, replace=self.replace, p=None)

     class TestWeightedChoice(HybridBlock):
         def __init__(self, sample_size, replace):
             super(TestWeightedChoice, self).__init__()
             self.sample_size = sample_size
             self.replace = replace

         def forward(self, a, p):
             op = getattr(np.random, "choice", None)
             return np.random.choice(a, self.sample_size, self.replace, p)

     def test_sample_with_replacement(sampler, num_classes, shape, weight=None):
         samples = sampler(num_classes, shape, replace=True, p=weight).asnumpy()
         generated_density = onp.histogram(samples, onp.arange(num_classes + 1), density=True)[0]
         expected_density = (weight.asnumpy() if weight is not None else
                             onp.array([1 / num_classes] * num_classes))
         # test almost equal
         assert_almost_equal(generated_density, expected_density, rtol=1e-1, atol=1e-1)
         # test shape
         assert (samples.shape == shape)

     def test_sample_without_replacement(sampler, num_classes, shape, num_trials, weight=None):
         samples = sampler(num_classes, shape, replace=False, p=weight).asnumpy()
         # Check shape and uniqueness
         assert samples.shape == shape
         assert len(onp.unique(samples)) == samples.size
         # Check distribution
         bins = onp.zeros((num_classes))
         expected_freq = (weight.asnumpy() if weight is not None else
                          onp.array([1 / num_classes] * num_classes))
         for _ in range(num_trials):
             out = sampler(num_classes, 1, replace=False, p=weight).item()
             bins[out] += 1
         bins /= num_trials
         assert_almost_equal(bins, expected_freq, rtol=1e-1, atol=1e-1)

     def test_indexing_mode(sampler, set_size, samples_size, replace, weight=None):
         a = np.arange(set_size)
         if weight is not None:
             samples = sampler(a, weight)
         else:
             samples = sampler(a)
         assert len(samples) == samples_size
         if not replace:
             assert len(onp.unique(samples.asnumpy())) == samples_size

     num_classes = 10
     num_samples = 10 ** 8
     # Density tests are commented out due to their huge time comsumption.
     # Tests passed locally.
     # shape_list1 = [
     #     (10 ** 8, 1),
     #     (10 ** 5, 10 ** 3),
     #     (10 ** 2, 10 ** 3, 10 ** 3)
     # ]
     # for shape in shape_list1:
     #     test_sample_with_replacement(np.random.choice, num_classes, shape)
     #     weight = np.array(onp.random.dirichlet([1.0] * num_classes))
     #     test_sample_with_replacement(np.random.choice, num_classes, shape, weight)

     # Tests passed locally,
     # commented out for the same reason as above.
     # shape_list2 = [
     #     (6, 1),
     #     (2, 3),
     #     (1, 2, 3),
     #     (2, 2),
     # ]
     # for shape in shape_list2:
     #     test_sample_without_replacement(np.random.choice, num_classes, shape, 10 ** 5)
     #     weight = np.array(onp.random.dirichlet([1.0] * num_classes))
     #     test_sample_without_replacement(np.random.choice, num_classes, shape, 10 ** 5, weight)

     # Test hypridize mode:
     for wtype in ['float16', 'float32', 'float64']:
         for hybridize in [True, False]:
             for replace in [True, False]:
                 test_choice = TestUniformChoice(num_classes // 2, replace)
                 test_choice_weighted = TestWeightedChoice(num_classes // 2, replace)
                 if hybridize:
                     test_choice.hybridize()
                     test_choice_weighted.hybridize()
                 weight = np.array(onp.random.dirichlet([1.0] * num_classes)).astype(wtype)
                 test_indexing_mode(test_choice, num_classes, num_classes // 2, replace, None)
                 test_indexing_mode(test_choice_weighted, num_classes, num_classes // 2, replace, weight)


 @use_np
 def test_np_eye():
     configs = [
         4,
         1000,
         (4, 3),
         (5, None),
         (4, None, 1),
         (2, 2, 1),
         (4, 6, 1),
         (7, 3, -3),
         (3, 2, -2),
         (4, 0),
         (0, 0),
         (0, 3),
         (0, 0, -2)
     ]
     exception_configs = [
         -1,
         -1000,
         (-2, None),
         (1, -1)
     ]
     dtypes = ['int32', 'float16', 'float32', 'float64', None]
     for config in configs:
         for dtype in dtypes:
             if isinstance(config, tuple):
                 mx_ret = np.eye(*config, dtype=dtype)
                 np_ret = onp.eye(*config, dtype=dtype)
             else:
                 mx_ret = np.eye(config, dtype=dtype)
                 np_ret = onp.eye(config, dtype=dtype)
             assert same(mx_ret.asnumpy(), np_ret)
     # check for exception input
     for config in exception_configs:
         if isinstance(config, tuple):
             assertRaises(MXNetError, np.eye, *config)
         else:
             assertRaises(MXNetError, np.eye, config)

     class TestEye(HybridBlock):
         def __init__(self, N, M=None, k=0, dtype=None):
             super(TestEye, self).__init__()
             self._N = N
             self._M = M
             self._k = k
             self._dtype = dtype

         def forward(self, x):
             return x + np.eye(self._N, self._M, self._k, dtype=self._dtype)

     for dtype in dtypes:
         x = np.zeros(shape=(), dtype=dtype)
         for config in configs:
             for hybridize in [False, True]:
                 if isinstance(config, tuple):
                     net = TestEye(*config, dtype=dtype)
                     np_out = onp.eye(*config, dtype=dtype)
                 else:
                     net = TestEye(config, dtype=dtype)
                     np_out = onp.eye(config, dtype=dtype)
                 if hybridize:
                     net.hybridize()
                 mx_out = net(x)
                 assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_np_indices():
     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     shapes = [
         (0,),
         (3,),
         (2, 3, 4),
         (2, 0, 4),
         (1, 1, 1, 1),
         (1, 0, 0, 1),
         (2, 3, 4, 5, 6, 7)
     ]
     if platform.system() == 'Windows':
         shapes = shapes[1:]  # beacuse in numpy windows version, indces not support dimensions is empty tuple.
     for dtype in dtypes:
         for shape in shapes:
             np_out = onp.indices(dimensions=shape, dtype=dtype)
             mx_out = np.indices(dimensions=shape, dtype=dtype)
             assert same(mx_out.asnumpy(), np_out)
             assert mx_out.shape == np_out.shape

     @use_np
     class TestIndices(HybridBlock):
         def __init__(self, dimensions=None, dtype=None):
             super(TestIndices, self).__init__()
             self._dimensions = dimensions
             self._dtype = dtype

         def forward(self, x):
             return x + np.indices(dimensions=self._dimensions, dtype=self._dtype)

     for dtype in dtypes:
         for shape in shapes:
             x = np.zeros(shape=(), dtype=dtype)
             for hybridize in [False, True]:
                 net = TestIndices(dimensions=shape, dtype=dtype)
                 np_out = onp.indices(dimensions=shape, dtype=dtype)
                 if hybridize:
                     net.hybridize()
                 mx_out = net(x)
                 assert same(mx_out.asnumpy(), np_out)
                 assert mx_out.shape == np_out.shape


 @use_np
 def test_np_repeat():
     config = [
         ((), 2, None),
         ((), 0, None),
         ((4, 2), 2, None),
         ((4, 2), 2, 0),
         ((4, 2), 2, 1),
         ((4, 2), 2, -1),
         ((4, 2), [2,3] * 4, None),
         ((4, 2), [1,2], 1),
     ]

     class TestRepeat(HybridBlock):
         def __init__(self, repeats, axis=None):
             super(TestRepeat, self).__init__()
             self._repeats = repeats
             self._axis = axis

         def forward(self, x):
             return x.repeat(self._repeats, self._axis)

     for shape, repeats, axis in config:
         data_np = onp.random.randint(low=0, high=1000, size=shape)
         data_mx = np.array(data_np, dtype=data_np.dtype)
         ret_np = data_np.repeat(repeats, axis)
         ret_mx = data_mx.repeat(repeats, axis)
         assert same(ret_mx.asnumpy(), ret_np)

         net = TestRepeat(repeats, axis)
         for hybrid in [False, True]:
             if hybrid:
                 net.hybridize()
             ret_mx = net(data_mx)
             assert same(ret_mx.asnumpy(), ret_np)


 @use_np
 def test_np_linalg_norm():
     class TestLinalgNorm(HybridBlock):
         def __init__(self, ord=None, axis=None, keepdims=False):
             super(TestLinalgNorm, self).__init__()
             self._ord = ord
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, x):
             return np.linalg.norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims)

     configs = [
         ((2, 3, 4), 1, (2, 1)),
         ((2, 3, 4), 2, (1, 2)),
         ((2, 3, 4), None, None),
         ((3,), None, None),
         ((2, 3), 2, 1),
         ((2, 3, 4), 1, 1),
         ((2, 3, 4), -1, 2),
         ((2, 3, 4), 2, 1),
         ((2, 3, 4), 4, 1),
         ((2, 3, 0, 4), -2, 1),
         ((2, 3, 4, 5), 2, (2, 3)),
         ((2, 3), -1, None),
         ((2, 3, 4), 'inf', 1),
         ((2, 3, 4), '-inf', (1, 0)),
         ((2, 3), None, (0, 1)),
         ((3, 2, 3), None, (1, 2)),
         ((2, 3), None, None),
         ((2, 3, 4), 'fro', (0, 2)),
         ((2, 0, 4), 'fro', (0, 2)),
         ((2, 3, 4), None, (0, 2)),
         ((2, 3, 4), -3.2, 2),
         ((2, 3, 4), -1, (0, 1)),
         ((2, 3, 4), 'inf', (0, 2)),
         ((2, 3, 4), '-inf', (0, 2)),
         ((4, 4, 4, 4), -2, (0, 2)),
         ((2, 3, 4), 'nuc', (0, 2)),
         ((2, 2), 'nuc', None),
     ]

     def spectral_norm_grad(data):
         with mx.autograd.record():
             UT, S, V = np.linalg.svd(data)
             norm = np.max(np.abs(S), axis=-1)
         norm.backward()
         return data.grad.asnumpy()

     # numpy is flaky under float16, also gesvd does not support fp16
     dtypes = [np.float32, np.float64]
     for hybridize, itype, (shape, ord, axis), keepdims in \
         itertools.product([True, False], dtypes, configs, [True, False]):
         net = TestLinalgNorm(ord, axis, keepdims)
         rtol = 1e-2
         atol = 1e-2
         if hybridize:
             net.hybridize()
         a = mx.nd.random.uniform(-10.0, 10.0, shape=shape, dtype=itype).as_np_ndarray()
         a.attach_grad()
         with mx.autograd.record():
             mx_ret = net(a)
         if ord == 'inf':
             np_ret = onp.linalg.norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
         elif ord == '-inf':
             np_ret = onp.linalg.norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
         else:
             np_ret = onp.linalg.norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)

         assert np_ret.shape == mx_ret.shape
         assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)

         mx_ret.backward()

         grad_axis = axis
         if axis is None and len(shape) >= 2 and ord is not None:
             grad_axis = (len(shape) - 2, len(shape) - 1)
         elif axis is None and ord is None:
             grad_axis = tuple([i for i in range(len(shape))])
         elif axis is None:
             grad_axis = len(shape) - 1

         if not keepdims and isinstance(grad_axis, tuple):
             if len(grad_axis) == 2 and grad_axis[0] > grad_axis[1] and grad_axis[0] > len(np_ret.shape):
                 grad_axis = (grad_axis[1], grad_axis[0])
             for i in grad_axis:
                 np_ret = onp.expand_dims(np_ret, axis=i)
         elif not keepdims:
             np_ret = onp.expand_dims(np_ret, axis=grad_axis)

         if ord == 4:
             backward_expected = onp.sign(a.asnumpy()) * onp.power(onp.abs(a.asnumpy()) / np_ret, ord - 1)
             assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

         if ord == 2 and not isinstance(grad_axis, tuple):
             backward_expected = onp.divide(a.asnumpy(), np_ret)
             assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
         elif ord == 2 and isinstance(grad_axis, tuple):
             backward_expected = spectral_norm_grad(a)
             assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

         if ord == 'fro':
             backward_expected = onp.divide(a.asnumpy(), np_ret)
             assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

         assert a.grad.shape == a.shape

         # Test imperative once again
         if ord == 'inf':
             np_ret = onp.linalg.norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
         elif ord == '-inf':
             np_ret = onp.linalg.norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
         else:
             np_ret = onp.linalg.norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)
         mx_ret = np.linalg.norm(a, ord=ord, axis=axis, keepdims=keepdims)
         assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.parametrize('shape,ord,axis', [
     ((2, 3, 4), 2, (1, 2)),
     ((2, 3, 4), None, None),
     ((3,), None, None),
     ((2, 3), 2, 1),
     ((2, 3, 4), 1, 1),
     ((2, 3, 4), -1, 2),
     ((2, 3, 4), 2, 1),
     ((2, 3, 4), 4, 1),
     ((2, 3, 0, 4), -2, 1),
     ((2, 3, 4, 5), 2, (2, 3)),
     ((2, 3, 4), 'inf', 1),
     ((2, 3, 4), '-inf', (1, 0)),
     ((2, 3), None, (0, 1)),
     ((3, 2, 3), None, (1, 2)),
     ((2, 3), None, None),
     ((2, 3, 4), None, (0, 2)),
     ((2, 3, 4), -3.2, 2),
     ((2, 3, 4), 'inf', (0, 2)),
     ((2, 3, 4), '-inf', (0, 2)),
     ((2, 3, 4, 5, 7), 2, (2, 3, 1)),
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('itype', [np.float32, np.float64])
 @pytest.mark.parametrize('keepdims', [True, False])
 def test_np_linalg_vector_norm(shape, ord, axis, hybridize, itype, keepdims):
     class TestLinalgVectNorm(HybridBlock):
         def __init__(self, ord=None, axis=None, keepdims=False):
             super(TestLinalgVectNorm, self).__init__()
             self._ord = ord
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, x):
             return np.linalg.vector_norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims)

     def spectral_norm_grad(data):
         with mx.autograd.record():
             UT, S, V = np.linalg.svd(data)
             norm = np.max(np.abs(S), axis=-1)
         norm.backward()
         return data.grad.asnumpy()

     def onp_vector_norm(a, axis=None, keepdims=False, ord=2):
         if axis is None:
             a = a.flatten()
             axis = 0
         elif isinstance(axis, tuple):
             # Note: The axis argument supports any number of axes, whereas norm()
             # only supports a single axis for vector norm.
             rest = tuple(i for i in range(a.ndim) if i not in axis)
             newshape = axis + rest
             a = onp.transpose(a, newshape).reshape((reduce(lambda x, y: x * y, [a.shape[x] for x in axis]), *[a.shape[i] for i in rest]))
             axis = 0
         return onp.linalg.norm(a, axis=axis, keepdims=keepdims, ord=ord)

     # numpy is flaky under float16, also gesvd does not support fp16
     net = TestLinalgVectNorm(ord, axis, keepdims)
     rtol = 1e-2
     atol = 1e-2
     if hybridize:
         net.hybridize()
     a = mx.np.random.uniform(-10.0, 10.0, size=shape, dtype=itype)
     a.attach_grad()
     with mx.autograd.record():
         mx_ret = net(a)
     if ord == 'inf':
         np_ret = onp_vector_norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
     elif ord == '-inf':
         np_ret = onp_vector_norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
     else:
         np_ret = onp_vector_norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)

     assert np_ret.shape == mx_ret.shape
     assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)

     mx_ret.backward()

     grad_axis = axis
     if axis is None and len(shape) >= 2 and ord is not None:
         grad_axis = (len(shape) - 2, len(shape) - 1)
     elif axis is None and ord is None:
         grad_axis = tuple([i for i in range(len(shape))])
     elif axis is None:
         grad_axis = len(shape) - 1

     if not keepdims and isinstance(grad_axis, tuple):
         if len(grad_axis) == 2 and grad_axis[0] > grad_axis[1] and grad_axis[0] > len(np_ret.shape):
             grad_axis = (grad_axis[1], grad_axis[0])
         for i in grad_axis:
             np_ret = onp.expand_dims(np_ret, axis=i)
     elif not keepdims:
         np_ret = onp.expand_dims(np_ret, axis=grad_axis)

     if ord == 4:
         backward_expected = onp.sign(a.asnumpy()) * onp.power(onp.abs(a.asnumpy()) / np_ret, ord - 1)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

     if ord == 2 and not isinstance(grad_axis, tuple):
         backward_expected = onp.divide(a.asnumpy(), np_ret)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
     elif ord == 2 and isinstance(grad_axis, tuple):
         backward_expected = spectral_norm_grad(a)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

     assert a.grad.shape == a.shape

     # Test imperative once again
     if ord == 'inf':
         np_ret = onp_vector_norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
     elif ord == '-inf':
         np_ret = onp_vector_norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
     else:
         np_ret = onp_vector_norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)
     mx_ret = np.linalg.vector_norm(a, ord=ord, axis=axis, keepdims=keepdims)
     assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.parametrize('shape,ord,axis', [
     ((2, 3, 4), 1, (2, 1)),
     ((2, 3, 4), 2, (1, 2)),
     ((2, 3, 4), None, None),
     ((3,), None, None),
     ((2, 3), 2, 1),
     ((2, 3, 4), 1, 1),
     ((2, 3, 4), -1, 2),
     ((2, 3, 4), 2, 1),
     ((2, 3, 4), 4, 1),
     ((2, 3, 0, 4), -2, 1),
     ((2, 3, 4, 5), 2, (2, 3)),
     ((2, 3), -1, None),
     ((2, 3, 4), 'inf', 1),
     ((2, 3, 4), '-inf', (1, 0)),
     ((2, 3), None, (0, 1)),
     ((3, 2, 3), None, (1, 2)),
     ((2, 3), None, None),
     ((2, 3, 4), 'fro', (0, 2)),
     ((2, 0, 4), 'fro', (0, 2)),
     ((2, 3, 4), None, (0, 2)),
     ((2, 3, 4), -3.2, 2),
     ((2, 3, 4), -1, (0, 1)),
     ((2, 3, 4), 'inf', (0, 2)),
     ((2, 3, 4), '-inf', (0, 2)),
     ((4, 4, 4, 4), -2, (0, 2)),
     ((2, 3, 4), 'nuc', (0, 2)),
     ((2, 2), 'nuc', None),
 ])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('itype', [np.float32, np.float64])
 @pytest.mark.parametrize('keepdims', [True, False])
 def test_np_linalg_matrix_norm(shape, ord, axis, hybridize, itype, keepdims):
     class TestLinalgMatNorm(HybridBlock):
         def __init__(self, ord=None, axis=None, keepdims=False):
             super(TestLinalgMatNorm, self).__init__()
             self._ord = ord
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, x):
             return np.linalg.matrix_norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims)

     def spectral_norm_grad(data):
         with mx.autograd.record():
             UT, S, V = np.linalg.svd(data)
             norm = np.max(np.abs(S), axis=-1)
         norm.backward()
         return data.grad.asnumpy()

     # numpy is flaky under float16, also gesvd does not support fp16
     net = TestLinalgMatNorm(ord, axis, keepdims)
     rtol = 1e-2
     atol = 1e-2
     if hybridize:
         net.hybridize()
     a = mx.np.random.uniform(-10.0, 10.0, size=shape, dtype=itype)
     if not isinstance(axis, tuple) or not len(axis) == 2:
         assertRaises(ValueError, np.linalg.matrix_norm, a, ord, axis, keepdims)
         return
     a.attach_grad()
     with mx.autograd.record():
         mx_ret = net(a)
     if ord == 'inf':
         np_ret = onp.linalg.norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
     elif ord == '-inf':
         np_ret = onp.linalg.norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
     else:
         np_ret = onp.linalg.norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)

     assert np_ret.shape == mx_ret.shape
     assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)

     mx_ret.backward()

     grad_axis = axis
     if axis is None and len(shape) >= 2 and ord is not None:
         grad_axis = (len(shape) - 2, len(shape) - 1)
     elif axis is None and ord is None:
         grad_axis = tuple([i for i in range(len(shape))])
     elif axis is None:
         grad_axis = len(shape) - 1

     if not keepdims and isinstance(grad_axis, tuple):
         if len(grad_axis) == 2 and grad_axis[0] > grad_axis[1] and grad_axis[0] > len(np_ret.shape):
             grad_axis = (grad_axis[1], grad_axis[0])
         for i in grad_axis:
             np_ret = onp.expand_dims(np_ret, axis=i)
     elif not keepdims:
         np_ret = onp.expand_dims(np_ret, axis=grad_axis)

     if ord == 4:
         backward_expected = onp.sign(a.asnumpy()) * onp.power(onp.abs(a.asnumpy()) / np_ret, ord - 1)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

     if ord == 2 and not isinstance(grad_axis, tuple):
         backward_expected = onp.divide(a.asnumpy(), np_ret)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
     elif ord == 2 and isinstance(grad_axis, tuple):
         backward_expected = spectral_norm_grad(a)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

     if ord == 'fro':
         backward_expected = onp.divide(a.asnumpy(), np_ret)
         assert_almost_equal(a.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

     assert a.grad.shape == a.shape

     # Test imperative once again
     if ord == 'inf':
         np_ret = onp.linalg.norm(a.asnumpy(), ord=onp.inf, axis=axis, keepdims=keepdims)
     elif ord == '-inf':
         np_ret = onp.linalg.norm(a.asnumpy(), ord=-onp.inf, axis=axis, keepdims=keepdims)
     else:
         np_ret = onp.linalg.norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims)
     mx_ret = np.linalg.matrix_norm(a, ord=ord, axis=axis, keepdims=keepdims)
     assert_almost_equal(mx_ret.asnumpy(), np_ret, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.parametrize('shape', [
     (3, 3),
     (3, 5),
     (4, 4),
     (4, 5),
     (5, 5),
     (5, 6),
     (6, 6),
     (0, 1),
     (6, 5, 6),
     (2, 3, 3, 4),
     (4, 2, 1, 2),
     (0, 5, 3, 3),
     (5, 0, 3, 3),
     (3, 3, 0, 0),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('hybridize', [False, True])
 def test_np_linalg_svd(shape, dtype, hybridize):
     class TestSVD(HybridBlock):
         def __init__(self):
             super(TestSVD, self).__init__()

         def forward(self, data):
             return np.linalg.svd(data)

     def get_grad(UT, L, V):
         m = V.shape[-2]
         n = V.shape[-1]
         E = onp.zeros_like(UT)
         dUT = onp.ones_like(UT)
         dV = onp.ones_like(V)
         for i in range(m):
             for j in range(i + 1, m):
                 denom1 = onp.maximum(L[..., i] - L[..., j], 1e-20)
                 denom2 = onp.maximum(L[..., i] + L[..., j], 1e-20)
                 E[..., i, j] = 1.0 / denom1 / denom2
                 E[..., j, i] = -E[..., i, j]
             E[..., i, i] = 0
         G1 = onp.matmul(1.0 / L[..., None] * dV, onp.swapaxes(V, -2, -1)) * L[..., None, :]
         G1 = G1 + onp.matmul(onp.swapaxes(dUT, -2, -1), UT)
         X = G1 * E
         G2 = onp.eye(m) + (X + onp.swapaxes(X, -2, -1)) * L[..., None, :] - 1.0 / L[..., None] * onp.matmul(dV, onp.swapaxes(V, -2, -1)) * onp.eye(m)
         dA = onp.matmul(UT, onp.matmul(G2, V) + 1.0 / L[..., None] * dV)
         return dA

     def check_svd(UT, L, V, data_np):
         shape = data_np.shape
         # check UT @ L @ V == A
         t = onp.matmul(UT * L[..., None, :], V)
         assert t.shape == data_np.shape
         assert_almost_equal(t, data_np, rtol=rtol, atol=atol)
         # check UT @ U == I
         I = onp.matmul(UT, onp.swapaxes(UT, -2, -1))
         I_np = onp.ones_like(UT) * onp.eye(shape[-2])
         assert I.shape == I_np.shape
         assert_almost_equal(I, I_np, rtol=rtol, atol=atol)
         # check U @ UT == I
         I = onp.matmul(onp.swapaxes(UT, -2, -1), UT)
         I_np = onp.ones_like(UT) * onp.eye(shape[-2])
         assert I.shape == I_np.shape
         assert_almost_equal(I, I_np, rtol=rtol, atol=atol)
         # check V @ VT == I
         I = onp.matmul(V, onp.swapaxes(V, -2, -1))
         I_np = onp.ones_like(UT) * onp.eye(shape[-2])
         assert I.shape == I_np.shape
         assert_almost_equal(I, I_np, rtol=rtol, atol=atol)

     rtol = atol = 0.01
     test_svd = TestSVD()
     if hybridize:
         test_svd.hybridize()
     data_np = onp.random.uniform(-10.0, 10.0, shape)
     data_np = onp.array(data_np, dtype=dtype)
     data = np.array(data_np, dtype=dtype)
     if effective_dtype(data) == onp.dtype(np.float16):
         pytest.skip()
     data.attach_grad()
     with mx.autograd.record():
         ret = test_svd(data)
     UT = ret[0].asnumpy()
     L = ret[1].asnumpy()
     V = ret[2].asnumpy()
     # check svd validity
     check_svd(UT, L, V, data_np)
     # check descending singular values
     s = [L[..., i] - L[..., i + 1] for i in range(L.shape[-1] - 1)]
     s = onp.array(s)
     assert (s >= -1e-5).all()
     if L.size > 0:
         assert (L[..., -1] >= -1e-5).all()
     # check backward
     mx.autograd.backward(ret)
     if ((s > 1e-5).all() and (L.size == 0 or (L > 1e-5).all())):
         backward_expected = get_grad(ret[0].asnumpy(), ret[1].asnumpy(), ret[2].asnumpy())
         assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
     # Test imperative once again
     ret = np.linalg.svd(data)
     UT = ret[0].asnumpy()
     L = ret[1].asnumpy()
     V = ret[2].asnumpy()
     check_svd(UT, L, V, data_np)


 @use_np
 @pytest.mark.parametrize('shape', [
     (3, 3),
     (3, 5),
     (4, 4),
     (4, 5),
     (5, 5),
     (5, 6),
     (6, 6),
     (0, 1),
     (6, 5, 6),
     (2, 3, 3, 4),
     (4, 2, 1, 2),
     (0, 5, 3, 3),
     (5, 0, 3, 3),
     (3, 3, 0, 0),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('hybridize', [False, True])
 def test_np_linalg_svdvals(shape, dtype, hybridize):
     class TestSVD(HybridBlock):
         def __init__(self):
             super(TestSVD, self).__init__()

         def forward(self, data):
             return np.linalg.svdvals(data)

     rtol = atol = 0.01
     test_svd = TestSVD()
     if hybridize:
         test_svd.hybridize()
     data_np = onp.random.uniform(-10.0, 10.0, shape)
     data_np = onp.array(data_np, dtype=dtype)
     data = np.array(data_np, dtype=dtype)
     if effective_dtype(data) == onp.dtype(np.float16):
         pytest.skip()
     mx_out = test_svd(data)
     np_out = onp.linalg.svd(data, compute_uv=False)
     # check svdvals validity
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
     # Test imperative once again
     mx_out = np.linalg.svdvals(data)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_linalg_qr():
     class TestQR(HybridBlock):
         def __init__(self):
             super(TestQR, self).__init__()

         def forward(self, data):
             return np.linalg.qr(data)

     def get_expected_grad(a, q, r, dq, dr):
         # for all input shapes (..., m, n)
         if 0 in r.shape:
             return r
         def _copyltu(M):
             eye = onp.array([onp.eye(M.shape[-1]) for i in range(M.shape[0])])
             lower = onp.tril(M) - eye * M
             lower_mask = onp.tril(onp.ones_like(M))
             ret = lower_mask * M + lower.swapaxes(-1, -2)
             return ret
         def _case_m_ge_n(a, q, r, dq, dr):
                 dq_t = dq.swapaxes(-1, -2)
                 dr_t = dr.swapaxes(-1, -2)
                 r_inv = onp.linalg.inv(r)
                 r_inv_t = r_inv.swapaxes(-1, -2)
                 r_t = r.swapaxes(-1, -2)
                 # Get M
                 M = onp.matmul(r, dr_t) - onp.matmul(dq_t, q)
                 da = onp.matmul(dq + onp.matmul(q, _copyltu(M)), r_inv_t)
                 return da
         m, n = a.shape[-2], a.shape[-1]
         x = a[..., :, :m]
         x_shape = x.shape
         y = a[..., :, m:]
         y_shape = y.shape
         u = r[..., :, :m]
         v = r[..., :, m:]
         dv = dr[..., :, m:]
         du = dr[..., :, :m]
         q = q.reshape(-1, q.shape[-2], q.shape[-1])
         u = u.reshape(-1, u.shape[-2], u.shape[-1])
         dq = dq.reshape(-1, q.shape[-2], q.shape[-1])
         du = du.reshape(-1, du.shape[-2], du.shape[-1])
         if m >= n:
             dx = _case_m_ge_n(x, q, u, dq, du).reshape(x_shape)
             return dx
         else:
             dv = dv.reshape(-1, dv.shape[-2], dv.shape[-1])
             y = y.reshape(-1, y.shape[-2], y.shape[-1])
             dy = onp.matmul(q, dv).reshape(y_shape)
             dq_prime = dq + onp.matmul(y, dv.swapaxes(-1, -2))
             dx = _case_m_ge_n(x, q, u, dq_prime, du).reshape(x_shape)
             da = onp.concatenate([dx, dy], axis=-1)
             return da

     def well_conditioned_rectang_matrix_2D(shape, ran=(-1., 1.), max_cond=4):
         m, n = shape[-2], shape[-1]
         while 1:
             Q1, R1 = onp.linalg.qr(onp.random.uniform(ran[0], ran[1], (m, m)))
             D = onp.eye(m, n)
             Q2, R2 = onp.linalg.qr(onp.random.uniform(ran[0], ran[1], (n, n)))
             a = onp.matmul(onp.matmul(Q1, D), onp.swapaxes(Q2, -1, -2))
             if (onp.linalg.cond(a, 2) < max_cond):
                 return a

     def well_conditioned_rectang_matrix_nD(shape, ran=(-1., 1.), max_cond=4):
         p = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
         return onp.array([well_conditioned_rectang_matrix_2D(shape, ran, max_cond) for i in range(p)]).reshape(shape)

     def check_qr(q, r, a_np):
         # check Q@R = A
         t = onp.matmul(q, r)
         assert t.shape == a_np.shape
         assert_almost_equal(t, a_np, rtol=rtol, atol=atol)
         # check QT@Q = I
         qT = onp.swapaxes(q, -2, -1)
         I = onp.matmul(qT, q)
         Ip = onp.eye(I.shape[-2])
         assert_almost_equal(I, Ip, atol=atol, rtol=rtol)
         # check original numpy
         try:
             q_expected, r_expected = onp.linalg.qr(a_np)
         except Exception as e:
             print("a_np", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             assert q.shape == q_expected.shape
             assert r.shape == r_expected.shape
             assert_almost_equal(q.asnumpy(), q_expected, rtol=rtol, atol=atol)
             assert_almost_equal(r.asnumpy(), r_expected, rtol=rtol, atol=atol)
     shapes = [
         (3, 5),
         (5, 3),
         (10, 10),
         (0, 1),
         (6, 5, 6),
         (6, 6, 5),
         (2, 3, 2, 3),
         (2, 3, 3, 2),
         (5, 0, 3, 3),
         (3, 3, 0, 0),
     ]
     dtypes = ['float64', 'float32']
     for hybridize, shape, dtype in itertools.product([False, True], shapes, dtypes):
         rtol = atol = 1e-2
         if dtype == 'float32':
             rtol = atol = 3e-2

         test_qr = TestQR()
         if hybridize:
             test_qr.hybridize()
         if 0 in shape:
             data_np = onp.ones(shape)
         else:
             data_np = well_conditioned_rectang_matrix_nD(shape, max_cond=4)

         data_np = onp.array(data_np, dtype=dtype)
         data = np.array(data_np, dtype=dtype)
         if effective_dtype(data) == onp.dtype(np.float16):
             print('Skipping test on this platform: {} has a float16 effective dtype'.format(dtype))
             pytest.skip()

         data.attach_grad()
         with mx.autograd.record():
             ret = test_qr(data)
         Q, R = ret[0], ret[1]
         check_qr(Q, R, data_np)

         if 0 not in R.shape:
             assert data.grad.shape == data_np.shape
             backward_expected = get_expected_grad(data_np, Q.asnumpy(), R.asnumpy(),
                                                   onp.ones(Q.shape), onp.ones(R.shape))
             mx.autograd.backward(ret)
             assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

         # check imperative once more; mode='reduced' is default
         # behavior and optional parameter in original numpy
         ret = np.linalg.qr(data, mode='reduced')
         Q, R = ret[0], ret[1]
         check_qr(Q, R, data_np)


 @use_np
 @pytest.mark.parametrize('shape', [
     (0, 0),
     (1, 1),
     (5, 5),
     (6, 6),
     (10, 10),
     (6, 6, 6),
     (1, 0, 0),
     (0, 1, 1),
     (2, 3, 4, 4),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('upper', [True, False])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_linalg_cholesky(shape, dtype, upper, hybridize):
     class TestCholesky(HybridBlock):
         def __init__(self, upper=False):
             super(TestCholesky, self).__init__()
             self._upper = upper

         def forward(self, data):
             return np.linalg.cholesky(data, upper=self._upper)

     def get_grad(L, upper):
         # shape of m is [batch, n, n]
         if 0 in L.shape:
             return L

         if upper:
             L = onp.swapaxes(L, -1, -2)

         def copyltu(m):
             eye = onp.array([onp.eye(m.shape[-1]) for i in range(m.shape[0])])
             lower = onp.tril(m) - eye * m
             lower_mask = onp.tril(onp.ones_like(m))
             ret = lower_mask * m + lower.swapaxes(-1, -2)
             return ret

         shape = L.shape
         L = L.reshape(-1, shape[-2], shape[-1])
         dL = onp.ones_like(L)
         L_inv = onp.linalg.inv(L)
         L_inv_T = L_inv.swapaxes(-1, -2)
         L_T = L.swapaxes(-1, -2)
         sym_L_inv = 0.5 * (L_inv + L_inv_T)
         dA = 0.5 * onp.matmul(onp.matmul(L_inv_T, copyltu(onp.matmul(L_T, dL))), L_inv)
         return dA.reshape(shape)

     def check_cholesky(L, data_np, upper):
         assert L.shape == data_np.shape
         # catch error if numpy throws rank < 2
         try:
             if upper:
                 L_expected = onp.swapaxes(onp.linalg.cholesky(data_np), -1, -2)
             else:
                 L_expected = onp.linalg.cholesky(data_np)
         except Exception as e:
             print(data_np)
             print(data_np.shape)
             print(e)
         else:
             assert L.shape == L_expected.shape
             assert_almost_equal(L.asnumpy(), L_expected, rtol=rtol, atol=atol)

     def newSymmetricPositiveDefineMatrix_2D(shape, ran=(0., 10.), max_cond=4):
         while 1:
             D = onp.diag(onp.random.uniform(ran[0], ran[1], shape[-1]))
             I = onp.eye(shape[-1]).reshape(shape)
             v = onp.random.uniform(-1., 1., shape[-1]).reshape(shape[:-1] + (1,))
             v = v / onp.linalg.norm(v, axis=-2, keepdims=True)
             v_T = onp.swapaxes(v, -1, -2)
             U = I - 2 * onp.matmul(v, v_T)
             a = onp.matmul(onp.matmul(U, D), onp.swapaxes(U, -1, -2))
             if (onp.linalg.cond(a, 2) < max_cond):
                 return a

     def newSymmetricPositiveDefineMatrix_nD(shape, ran=(0., 10.), max_cond=4):
         n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
         return onp.array([newSymmetricPositiveDefineMatrix_2D(shape[-2:], ran, max_cond) for i in range(n)]).reshape(shape)


     rtol = 1e-3
     atol = 1e-5
     if dtype == 'float32':
         rtol = 1e-2
         atol = 1e-4

     test_cholesky = TestCholesky(upper)
     if hybridize:
         test_cholesky.hybridize()

     # Numerical issue:
     # When backpropagating through Cholesky decomposition, we need to compute the inverse
     # of L according to dA = 0.5 * L**(-T) * copyLTU(L**T * dL) * L**(-1) where A = LL^T.
     # The inverse is calculated by "trsm" method in CBLAS. When the data type is float32,
     # this causes numerical instability. It happens when the matrix is ill-conditioned.
     # In this example, the issue occurs frequently if the symmetric positive definite input
     # matrix A is constructed by A = LL^T + \epsilon * I. A proper way of testing such
     # operators involving numerically unstable operations is to use well-conditioned random
     # matrices as input. Here we test Cholesky decomposition for FP32 and FP64 separately.
     # See rocBLAS:
     # https://github.com/ROCmSoftwarePlatform/rocBLAS/wiki/9.Numerical-Stability-in-TRSM

     # generate symmetric PD matrices
     if 0 in shape:
         data_np = np.ones(shape)
     else:
         data_np = newSymmetricPositiveDefineMatrix_nD(shape)

     # When dtype is np.FP32, truncation from FP64 to FP32 could also be a source of
     # instability since the ground-truth gradient is computed using FP64 data.
     data = np.array(data_np, dtype=dtype)
     data.attach_grad()
     with mx.autograd.record():
         L = test_cholesky(data)

     # check cholesky validity
     check_cholesky(L, data_np, upper)
     # check backward. backward does not support empty input
     if 0 not in L.shape:
         mx.autograd.backward(L)
         backward_expected = get_grad(L.asnumpy(), upper)
         assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
     # check imperative once again
     L = np.linalg.cholesky(data, upper=upper)
     check_cholesky(L, data_np, upper)


 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('shape', [
     (0, 0),
     (4, 4),
     (2, 2),
     (1, 1),
     (2, 1, 1),
     (0, 1, 1),
     (6, 1, 1),
     (2, 3, 3, 3),
     (4, 2, 1, 1),
     (0, 5, 3, 3),
     (5, 0, 0, 0),
     (3, 3, 0, 0),
     (3, 5, 5),
 ])
 @retry(3)
 def test_np_linalg_inv(hybridize, dtype, shape):
     class TestInverse(HybridBlock):
         def __init__(self):
             super(TestInverse, self).__init__()

         def forward(self, data):
             return np.linalg.inv(data)

     def get_grad(A):
         if 0 in A.shape:
             return A

         dA = onp.ones_like(A)
         A_inv = onp.linalg.inv(A)
         dA_inv = -onp.matmul(onp.matmul(A_inv, dA), A_inv)
         return onp.swapaxes(dA_inv, -1, -2)

     def check_inv(A_inv, data_np):
         assert A_inv.shape == data_np.shape
         # catch error if numpy throws rank < 2
         try:
             A_expected = onp.linalg.inv(data_np)
         except Exception as e:
             print(data_np)
             print(data_np.shape)
             print(e)
         else:
             assert A_inv.shape == A_expected.shape
             assert_almost_equal(A_inv.asnumpy(), A_expected, rtol=rtol, atol=atol)

     atol = rtol = 1e-2

     test_inv = TestInverse()
     if hybridize:
         test_inv.hybridize()
     # generate well-conditioned matrices with small eigenvalues
     if 0 in shape:
         data_np = onp.ones(shape)
     else:
         n = int(np.prod(np.array(shape[:-2]))) if len(shape) > 2 else 1
         # eigenvalues
         D = onp.array([onp.diag(onp.random.uniform(-10., 10., shape[-1])) \
                          for i in range(n)]).reshape(shape)
         # orthogonal matrix through householder transformation
         I = onp.array([onp.eye(shape[-1]) for i in range(n)]).reshape(shape)
         v = onp.random.uniform(-10, 10,
                 int(np.prod(np.array(shape[:-1])))).reshape(shape[:-1] + (1,))
         v = v / onp.linalg.norm(v, axis=-2, keepdims=True)
         v_T = onp.swapaxes(v, -1, -2)
         U = I - 2 * onp.matmul(v, v_T)
         data_np = onp.matmul(onp.matmul(U, D), onp.swapaxes(U, -1, -2))
     data = np.array(data_np, dtype=dtype)
     data.attach_grad()
     with mx.autograd.record():
         A_inv = test_inv(data)

     # check cholesky validity
     check_inv(A_inv, data_np)
     # check backward. backward does not support empty input
     mx.autograd.backward(A_inv)
     backward_expected = get_grad(data.asnumpy())
     assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
     # check imperative once again
     A_inv = np.linalg.inv(data)
     check_inv(A_inv, data_np)


 @use_np
 def test_np_linalg_solve():
     class TestSolve(HybridBlock):
         def __init__(self):
             super(TestSolve, self).__init__()

         def forward(self, a, b):
             return np.linalg.solve(a, b)

     def check_solve(x, a_np, b_np):
         try:
             x_expected = onp.linalg.solve(a_np, b_np)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print("b", b_np)
             print("b shape:", b_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             assert_almost_equal(x, x_expected)

     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
             # generate well-conditioned matrices with small eigenvalues
             D = onp.diag(onp.random.uniform(-1.0, 1.0, shape[-1]))
             I = onp.eye(shape[-1]).reshape(shape)
             v = onp.random.uniform(-10., 10., shape[-1]).reshape(shape[:-1] + (1,))
             v = v / onp.linalg.norm(v, axis=-2, keepdims=True)
             v_T = onp.swapaxes(v, -1, -2)
             U = I - 2 * onp.matmul(v, v_T)
             a = onp.matmul(U, D)
             if (onp.linalg.cond(a, 2) < max_cond):
                 return a

     def newInvertibleMatrix_nD(shape, max_cond=4):
         n = int(np.prod(np.array(shape[:-2]))) if len(shape) > 2 else 1
         return onp.array([newInvertibleMatrix_2D(shape[-2:]) for i in range(n)]).reshape(shape)

     def get_grad_b(A, X):
         dX = onp.ones_like(X)
         A_inv = onp.linalg.inv(A)
         A_inv_trans = onp.swapaxes(A_inv, -1, -2)
         return onp.matmul(A_inv_trans, dX)

     shapes = [
         (0, 0),
         (1, 1),
         (3, 3),
         (4, 4),
         (3, 2, 2),
         (1, 0, 0),
         (0, 1, 1),
         (0, 5, 3, 3),
         (5, 0, 0, 0),
         (2, 2, 5, 5)
     ]
     nrhs = (-1, 0, 1, 2, 3)
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, nrh in itertools.product([False, True], shapes, dtypes, nrhs):
         test_solve = TestSolve()
         if hybridize:
             test_solve.hybridize()

         if 0 in shape:
             a = onp.ones(shape)
             b = onp.ones(shape)
         else:
             shape_a = shape
             shape_b = list(shape_a)
             if nrh == -1:
                 shape_b[-1] = 1
             else :
                 shape_b[-1] = nrh
             a = newInvertibleMatrix_nD(shape_a)
             x = onp.random.randn(*shape_b)
             b = onp.matmul(a, x)
         a = np.array(a, dtype=dtype)
         b = np.array(b, dtype=dtype)
         a.attach_grad()
         b.attach_grad()
         with mx.autograd.record():
             mx_out = test_solve(a, b)
         # check solve validity
         assert mx_out.shape == b.shape
         check_solve(mx_out, a, b)

         # check backward. backward does not support empty input
         if 0 not in mx_out.shape:
             if nrh != -1:
                 mx.autograd.backward(mx_out)
                 b_backward_expected = get_grad_b(a.asnumpy(), mx_out.asnumpy())
                 a_backward_expected = -onp.matmul(b_backward_expected, onp.swapaxes(mx_out, -1, -2).asnumpy())
                 assert_almost_equal(a.grad, a_backward_expected)
                 assert_almost_equal(b.grad, b_backward_expected)

         # check imperative once again
         mx_out = np.linalg.solve(a, b)
         check_solve(mx_out, a, b)


 def test_np_linalg_tensorinv():
     class TestTensorinv(HybridBlock):
         def __init__(self, ind=2):
             super(TestTensorinv, self).__init__()
             self._ind = ind

         def forward(self, a):
             return np.linalg.tensorinv(a, ind=self._ind)

     def check_tensorinv(inv_a, a_np, ind):
         try:
             inv_a_expected = onp.linalg.tensorinv(a_np, ind=ind)
         except Exception as e:
             print(a_np)
             print(a_np.shape)
             print(e)
         else:
             assert inv_a.shape == inv_a_expected.shape
             assert_almost_equal(inv_a, inv_a_expected)

     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
             # generate well-conditioned matrices with small eigenvalues
             D = onp.diag(onp.random.uniform(-1.0, 1.0, shape[-1]))
             I = onp.eye(shape[-1]).reshape(shape)
             v = onp.random.uniform(-10., 10., shape[-1]).reshape(shape[:-1] + (1,))
             v = v / onp.linalg.norm(v, axis=-2, keepdims=True)
             v_T = onp.swapaxes(v, -1, -2)
             U = I - 2 * onp.matmul(v, v_T)
             a = onp.matmul(U, D)
             if (onp.linalg.cond(a, 2) < max_cond):
                 return a

     def get_grad_A(A, ind):
         inv_A = onp.linalg.tensorinv(A, ind)
         d_inv_A = onp.ones_like(inv_A)
         axes1 = len(A.shape) - ind
         axes2 = ind
         inv_A_trans_axes = tuple(onp.arange(len(A.shape)))[axes1:] + tuple(onp.arange(len(A.shape)))[:axes1]
         inv_A_trans = onp.transpose(inv_A, inv_A_trans_axes)
         temp_tensor = -onp.tensordot(inv_A_trans, d_inv_A, axes = axes1)
         return onp.tensordot(temp_tensor, inv_A_trans, axes = axes2)

     shapes = [
         (1, 1, 1),
         (1, 2, 2),
         (1, 6, 2, 3),
         (1, 10, 2, 5),
         (1, 12, 3, 4),
         (2, 1, 1),
         (2, 1, 1, 1),
         (2, 2, 5, 5, 2),
         (2, 1, 6, 3, 2),
         (2, 1, 8, 4, 2),
         (2, 12, 1, 3, 4, 1),
         (3, 1, 1, 1),
         (3, 2, 3, 1, 6),
         (3, 3, 2, 1, 2, 3, 1)
     ]
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, in itertools.product([False, True], shapes, dtypes):
         ind = shape[0]
         test_tensorinv = TestTensorinv(ind=ind)
         if hybridize:
             test_tensorinv.hybridize()

         prod_front = 1
         prod_back = 1
         for k in shape[1:ind + 1]:
             prod_front *= k
         for k in shape[1 + ind:]:
             prod_back *= k
         a_shape = (prod_back, prod_front)
         a = newInvertibleMatrix_2D(a_shape)
         a_shape = shape[1:]
         inv_a_shape = shape[(1 + ind):] + shape[1:(ind + 1)]
         a = np.array(a.reshape(a_shape), dtype=dtype)
         a.attach_grad()
         with mx.autograd.record():
             mx_out = test_tensorinv(a)
         # check tensorinv validity
         assert mx_out.shape == inv_a_shape
         check_tensorinv(mx_out, a, ind)

         # check tensorinv backward
         if 0 not in mx_out.shape:
             mx.autograd.backward(mx_out)
             grad_A_expected = get_grad_A(a.asnumpy(), ind)
             assert_almost_equal(a.grad, grad_A_expected)

     # check imperative once again
     mx_out = np.linalg.tensorinv(a, ind)
     check_tensorinv(mx_out, a, ind)


 @use_np
 def test_np_linalg_tensorsolve():
     class TestTensorsolve(HybridBlock):
         def __init__(self, axes):
             super(TestTensorsolve, self).__init__()
             self._axes = axes

         def forward(self, a, b):
             return np.linalg.tensorsolve(a, b, axes=self._axes)

     def get_tensorsolve_backward(a_np, b_np, mx_out_np, a_axes, a_origin_axes, a_trans_shape):
         if (a_np.ndim == 0 or b_np.ndim == 0) or (a_np.ndim == b_np.ndim):
             a_shape = a_np.shape
             b_shape = b_np.shape
             a_np = a_np.reshape((1, 1))
             b_np = b_np.reshape((1,))
             mx_out_np = mx_out_np.reshape((1,))
             dx = onp.ones_like(mx_out_np)
             inv_a_temp_np = onp.linalg.inv(a_np)
             grad_b = inv_a_temp_np[0][0] * dx[0]
             grad_a = -grad_b * mx_out_np[0]
             return grad_a.reshape(a_shape), grad_b.reshape(b_shape)
         else:
             dx = onp.ones_like(mx_out_np)
             a_np = a_np.transpose(a_axes)
             ind = a_np.ndim - mx_out_np.ndim
             tensorinv_a_np = onp.linalg.tensorinv(a_np, ind=ind)
             a_trans_axes = list(range(a_np.ndim))[a_np.ndim - ind:] + list(range(a_np.ndim))[:a_np.ndim - ind]
             trans_tensorinv_a_np = tensorinv_a_np.transpose(a_trans_axes)
             grad_b = onp.tensordot(trans_tensorinv_a_np, dx, axes=dx.ndim)
             grad_a = onp.tensordot(grad_b, mx_out_np, axes=0)
             grad_a = grad_a.transpose(a_origin_axes)
             return -grad_a, grad_b.reshape(b_np.shape)

     def check_tensorsolve(x, a_np, b_np, axes):
         try:
             x_expected = onp.linalg.tensorsolve(a_np, b_np, axes=axes)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print("b", b_np)
             print("b shape:", b_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             assert_almost_equal(x, x_expected)

     def shapeInfer(a_shape, b_shape, axes=None):
         # b_shape - Right-hand tensor shape, which can be of any shape.
         a_ndim = len(a_shape)
         b_ndim = len(b_shape)
         a_trans_shape = list(a_shape)
         a_axes = list(range(0, a_ndim))
         if axes is not None:
             for k in axes:
                 a_axes.remove(k)
                 a_axes.insert(a_ndim, k)
             for k in range(a_ndim):
                 a_trans_shape[k] = a_shape[a_axes[k]]
         x_shape = a_trans_shape[-(a_ndim - b_ndim):]
         prod = 1
         for k in x_shape:
             prod *= k
         if prod * prod != onp.prod(a_shape):
             raise ValueError("a is not square")
         if prod != onp.prod(b_shape):
             raise ValueError("a's shape and b's shape dismatch")
         return a_axes, (prod, prod), tuple(a_trans_shape), tuple(x_shape)

     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
             # generate well-conditioned matrices with small eigenvalues
             D = onp.diag(onp.random.uniform(-1.0, 1.0, shape[-1]))
             I = onp.eye(shape[-1]).reshape(shape)
             v = onp.random.uniform(-1., 1., shape[-1]).reshape(shape[:-1] + (1,))
             v = v / onp.linalg.norm(v, axis=-2, keepdims=True)
             v_T = onp.swapaxes(v, -1, -2)
             U = I - 2 * onp.matmul(v, v_T)
             a = onp.matmul(U, D)
             if (onp.linalg.cond(a, 2) < max_cond):
                 return a

     shapes = [
         # a_shape.ndim <= 6,
         # (a_shape, b_shape, axes)
         ((), (), None),                     # a.ndim == 0, b.ndim == 0, with axes must be None
         ((), (1, 1, 1), None),              # a.ndim == 0, b.ndim != 0, with axes must be None
         ((1, 1, 1), (), None),              # a.ndim != 0, b.ndim == 0, with axes == None
         ((1, 1, 1), (), (0, 1, 2)),         # a.ndim != 0, b.ndim == 0, with axes != None
         ((1, 1, 1), (1, 1, 1), None),       # a.ndim != 0, b.ndim != 0, a.ndim == b.ndim with axes == None
         ((1, 1, 1), (1, 1, 1), (2, 0, 1)),  # a.ndim != 0, b.ndim != 0, a.ndim == b.ndim with axes != None
         ((1, 1), (1,), None),               # a.ndim != 0, b.ndim != 0, a.ndim > b.ndim
         ((1, 1), (1, 1, 1, 1, 1), None),    # a.ndim != 0, b.ndim != 0, a.ndim < b.ndim - a.ndim
         ((4, 4), (4,), None),
         ((6, 2, 3), (6,), None),
         ((2, 3, 6), (6,), (0, 1)),
         ((3, 4, 2, 3, 2), (3, 4), None),
         ((2, 1, 4, 2, 4), (2, 4), (0, 1, 2)),
         ((2, 3, 3, 4, 2), (3, 4), (0, 2, 4)),
         ((1, 3, 3, 4, 4), (1, 3, 4), (1, 3)),
         ((1, 12, 4, 1, 3), (1, 2, 1, 2, 1, 3, 1), None),
         ((1, 4, 1, 12, 3), (1, 2, 1, 2, 1, 3, 1), (1, 2, 4)),
     ]
     dtypes = ['float32', 'float64']
     for hybridize in [True, False]:
         for dtype in dtypes:
             for a_shape, b_shape, axes in shapes:
                 test_tensorsolve = TestTensorsolve(axes)
                 if hybridize:
                     test_tensorsolve.hybridize()

                 a_axes, mat_shape, a_trans_shape, x_shape = shapeInfer(a_shape, b_shape, axes)
                 # generate coefficient tensor a and right side tensor b
                 if (len(a_shape) == 0 or len(b_shape) == 0) or (len(a_shape) == len(b_shape)):
                     a_np = onp.asarray(1).astype(dtype).reshape(a_shape)
                     b_np = onp.asarray(2).astype(dtype).reshape(b_shape)
                 else:
                     a_np = newInvertibleMatrix_2D(mat_shape, max_cond=3).reshape(a_trans_shape)
                     x_np = onp.random.randn(*x_shape)
                     b_np = onp.tensordot(a_np, x_np, axes=len(x_shape))

                 # resume original shape of tensor a
                 a_origin_axes = list(range(a_np.ndim))
                 if axes is not None:
                     for k in range(a_np.ndim):
                         a_origin_axes[a_axes[k]] = k
                 a_np = a_np.transpose(a_origin_axes)
                 a = np.array(a_np, dtype=dtype).reshape(a_shape)
                 b = np.array(b_np, dtype=dtype).reshape(b_shape)
                 a.attach_grad()
                 b.attach_grad()

                 with mx.autograd.record():
                     mx_out = test_tensorsolve(a, b)
                 # check tensorsolve validity
                 assert mx_out.shape == x_shape
                 check_tensorsolve(mx_out, a.asnumpy(), b.asnumpy(), axes)

                 # check backward
                 if len(a_shape) != 0 and len(b_shape) != 0:
                     mx.autograd.backward(mx_out)
                     grad_a_expected, grad_b_expected = get_tensorsolve_backward(
                         a.asnumpy(), b.asnumpy(), mx_out.asnumpy(), a_axes, a_origin_axes, a_trans_shape)
                     assert_almost_equal(a.grad, grad_a_expected)
                     assert_almost_equal(b.grad, grad_b_expected)

                 # check imperative once again
                 mx_out = test_tensorsolve(a, b)
                 check_tensorsolve(mx_out, a.asnumpy(), b.asnumpy(), axes)


 @use_np
 def test_np_linalg_lstsq():
     class TestLstsq(HybridBlock):
         def __init__(self, rcond):
             super(TestLstsq, self).__init__()
             self._rcond = rcond

         def forward(self, a, b, rcond='warn'):
             return np.linalg.lstsq(a, b, rcond=self._rcond)

     def check_lstsq(a_np, b_np, rcond_np, x, residuals, rank, s):
         try:
             if rcond_np == 'warn':
                 rcond_np = -1
             x_expected, residuals_expected, rank_expected, s_expected = onp.linalg.lstsq(a_np, b_np, rcond_np)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print("b:", b_np)
             print("b shape:", b_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             assert residuals.shape == residuals_expected.shape
             assert rank.shape == rank_expected.shape
             assert s.shape == s_expected.shape
             assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
             assert_almost_equal(residuals.asnumpy(), residuals_expected, rtol=rtol, atol=atol)
             assert_almost_equal(rank.asnumpy(), rank_expected, rtol=rtol, atol=atol)
             assert_almost_equal(s.asnumpy(), s_expected, rtol=rtol, atol=atol)

     shapes = [
         ((4, 0), (4,)),   # ncol == 0
         ((4, 0), (4, 2)), # ncol == 0
         ((0, 2), (0,)),   # nrow == 0
         ((0, 2), (0, 4)), # nrow == 0
         ((4, 2), (4, 0)), # nrhs == 0
         ((4, 4), (4, 0)), # nrhs == 0
         ((4, 6), (4, 0)), # nrhs == 0
         ((0, 0), (0, 4)), # nrow == 0, ncol == 0
         ((0, 2), (0, 0)), # nrow == 0, nrhs == 0
         ((4, 0), (4, 0)), # ncol == 0, nrhs == 0
         ((0, 0), (0,)),   # nrow == 0, ncol == 0, nrhs = none
         ((0, 0), (0, 0)), # nrow == 0, ncol == 0, nrhs = 0
         ((2, 1), (2,)),
         ((4, 1), (4,)),
         ((4, 2), (4,)),
         ((4, 4), (4,)),
         ((1, 4), (1, 4)),
         ((4, 2), (4, 1)),
         ((4, 2), (4, 3)),
         ((4, 4), (4, 3)),
         ((4, 6), (4, 3)),
     ]
     rconds = [None, "random", "warn"]
     dtypes = ['float32', 'float64']
     for rcond, hybridize in itertools.product(rconds, [True, False]):
         for dtype in dtypes:
             for a_shape, b_shape in shapes:
                 rtol = 1e-2 if dtype == 'float32' else 1e-3
                 atol = 1e-4 if dtype == 'float32' else 1e-5
                 if rcond == "random":
                     rcond = onp.random.uniform(100, 200)
                 test_lstsq = TestLstsq(rcond)
                 if hybridize:
                     test_lstsq.hybridize()
                 a_np = onp.random.uniform(-10.0, 10.0, a_shape)
                 b_np = onp.random.uniform(-10.0, 10.0, b_shape)
                 a = np.array(a_np, dtype=dtype)
                 b = np.array(b_np, dtype=dtype)
                 x, residuals, rank, s = test_lstsq(a, b)
                 # check lstsq validity
                 check_lstsq(a_np, b_np, rcond, x, residuals, rank, s)


 @use_np
 def test_np_linalg_matrix_rank():
     class TestMatrixRank(HybridBlock):
         def __init__(self, hermitian):
             super(TestMatrixRank, self).__init__()
             self._hermitian = hermitian

         def forward(self, M, tol=None):
             return np.linalg.matrix_rank(M, tol, hermitian=self._hermitian)

     def check_matrix_rank(rank, a_np, tol, hermitian):
         try:
             rank_expected = onp.linalg.matrix_rank(a_np, tol=tol, hermitian=hermitian)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             if a_np.ndim < 2:
                 assert rank.shape == onp.asarray(rank_expected).shape
             else:
                 assert rank.shape == rank_expected.shape
             assert_almost_equal(rank.asnumpy(), rank_expected, rtol=rtol, atol=atol)

     shapes = [
         ((), ()),
         ((1,), (1,)),
         ((3,), (1,)),
         ((1, 1), ()),
         ((1, 1), (1,)),
         ((3, 3), (1,)),
         ((3, 4), (1,)),
         ((4, 3), ()),
         ((4, 3), (1,)),
         ((4, 3), (2,)),
         ((4, 3), (2, 3,)),
         ((2, 1, 1), ()),
         ((2, 1, 1), (1,)),
         ((2, 3, 3), (2,)),
         ((2, 3, 4), (1,)),
         ((2, 4, 3), (2,)),
         ((2, 3, 1, 1), ()),
         ((2, 3, 1, 1), (1, 1)),
         ((2, 3, 1, 1), (2, 1)),
         ((2, 3, 4, 4), (1, 3)),
         ((2, 3, 4, 5), (2, 1)),
         ((2, 3, 5, 4), (1, 3)),
         ((2, 3, 1, 1), (2, 3)),
         ((2, 3, 4, 4), (2, 3)),
         ((2, 3, 4, 5), (2, 3)),
         ((2, 3, 5, 4), (2, 3)),
     ]
     dtypes = ['float32', 'float64']
     for dtype in dtypes:
         for a_shape, tol_shape in shapes:
             for tol_is_none, hybridize in itertools.product([True, False], [True, False]):
                 rtol = 1e-3
                 atol = 1e-5
                 test_matrix_rank = TestMatrixRank(hermitian=False)
                 if hybridize:
                     test_matrix_rank.hybridize()

                 a_np = onp.asarray(onp.random.uniform(-10., 10., a_shape))
                 a = np.array(a_np, dtype=dtype)
                 if tol_is_none:
                     rank = test_matrix_rank(a)
                     # check matrix_rank validity
                     check_matrix_rank(rank, a.asnumpy(), tol=None, hermitian=False)
                 else:
                     tol_np = onp.random.uniform(10., 20., tol_shape)
                     tol = np.array(tol_np, dtype=dtype)
                     rank = test_matrix_rank(a, tol)
                     # check matrix_rank validity
                     check_matrix_rank(rank, a.asnumpy(), tol.asnumpy(), hermitian=False)


 @use_np
 @pytest.mark.parametrize('shape', [
     (),
     (1,),
     (0, 1, 2),
     (0, 1, 2),
     (0, 1, 2),
     (4, 5, 6, 7),
     (4, 5, 6, 7),
     (4, 5, 6, 7),
 ])
 def test_np_linalg_matrix_transpose(shape):
     class TestMatTranspose(HybridBlock):
         def __init__(self):
             super(TestMatTranspose, self).__init__()

         def forward(self, x):
             return np.linalg.matrix_transpose(x)

     data_np = onp.random.uniform(size=shape)
     data_mx = np.array(data_np, dtype=data_np.dtype)
     if data_mx.ndim < 2:
         assertRaises(ValueError, np.linalg.matrix_transpose, data_mx)
         return
     ret_np = onp.swapaxes(data_np, -1, -2)
     ret_mx = np.linalg.matrix_transpose(data_mx)
     assert same(ret_mx.asnumpy(), ret_np)

     net = TestMatTranspose()
     for hybrid in [False, True]:
         if hybrid:
             net.hybridize()
         ret_mx = net(data_mx)
         assert same(ret_mx.asnumpy(), ret_np)

     assert same(data_mx.mT.asnumpy(), ret_np)


 @use_np
 def test_np_linalg_pinv():
     class TestPinv(HybridBlock):
         def __init__(self, hermitian):
             super(TestPinv, self).__init__()
             self._hermitian = hermitian

         def forward(self, a, rcond=1e-15):
             return np.linalg.pinv(a, rcond, hermitian=self._hermitian)

     def check_pinv(x, a_np, rcond_np, hermitian, use_rcond):
         try:
             if use_rcond:
                 x_expected = onp.linalg.pinv(a_np, rcond_np, hermitian=hermitian)
             else:
                 x_expected = onp.linalg.pinv(a_np, hermitian=hermitian)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             if use_rcond:
                 print("rcond_np", rcond_np)
                 print("b rcond_np:", rcond_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)

     shapes = [
         ((1, 1), ()),
         ((5, 5), ()),
         ((5, 6), ()),
         ((6, 5), ()),
         ((2, 3, 3), (1,)),
         ((2, 3, 3), (2,)),
         ((2, 3, 4), (2,)),
         ((2, 4, 3), (1,)),
         ((4, 5, 6), ()),
         ((4, 5, 6), (1,)),
         ((4, 6, 5), (4,)),
         ((2, 2, 4, 3), (1,)),
         ((2, 2, 4, 3), (2,)),
         ((2, 2, 4, 3), (1, 1)),
         ((2, 2, 4, 3), (1, 2)),
         ((2, 2, 4, 3), (2, 1)),
         ((2, 2, 4, 3), (2, 2)),
         ((2, 2, 3, 4), (1,)),
         ((2, 2, 3, 4), (2,)),
         ((2, 2, 3, 4), (1, 1)),
         ((2, 2, 3, 4), (1, 2)),
         ((2, 2, 3, 4), (2, 1)),
         ((2, 2, 3, 4), (2, 2)),
     ]
     dtypes = ['float32', 'float64']
     for dtype in dtypes:
         for a_shape, rcond_shape in shapes:
             for use_rcond, hybridize in itertools.product([True, False], [True, False]):
                 rtol = 1e-2 if dtype == 'float32' else 1e-3
                 atol = 1e-4 if dtype == 'float32' else 1e-5
                 hermitian = False
                 test_pinv = TestPinv(hermitian)
                 if hybridize:
                     test_pinv.hybridize()

                 a_np = onp.random.uniform(-10.0, 10.0, a_shape)
                 a_np = onp.array(a_np, dtype=dtype)
                 rcond_np = onp.random.uniform(0., 0.1, rcond_shape)
                 rcond_np = onp.array(rcond_np, dtype=dtype)
                 a = np.array(a_np, dtype=dtype)
                 rcond = np.array(rcond_np, dtype=dtype)
                 if use_rcond:
                     mx_out = test_pinv(a, rcond)
                 else:
                     mx_out = test_pinv(a)

                 # check tensorsolve validity
                 check_pinv(mx_out, a.asnumpy(), rcond.asnumpy(), hermitian, use_rcond)


 @use_np
 def test_np_linalg_eigvals():
     class TestEigvals(HybridBlock):
         def __init__(self):
             super(TestEigvals, self).__init__()

         def forward(self, a):
             return np.linalg.eigvals(a)

     def check_eigvals(x, a_np):
         try:
             x_expected = onp.linalg.eigvals(a_np)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             if 0 not in x.shape:
                 n = int(onp.prod(x.shape[:-1])) if len(shape) > 1 else 1
                 x = x.reshape(n, -1)
                 x_expected = x_expected.reshape(n, -1)
                 for i in range(n):
                     x1 = onp.sort(x[i].asnumpy())
                     x2 = onp.sort(x_expected[i])
                     assert_almost_equal(x1, x2, rtol=rtol, atol=atol)

     shapes = [
         (0, 0),
         (1, 1),
         (3, 3),
         (5, 5),
         (1, 0, 0),
         (0, 4, 4),
         (1, 4, 4),
         (2, 4, 4),
         (5, 5, 5),
         (1, 1, 4, 4),
         (2, 3, 4, 4)
     ]
     dtypes = ['float32', 'float64', 'uint8', 'int8', 'int32', 'int64']
     UPLOs = ['L', 'U']
     for hybridize in [True, False]:
         for shape, dtype in itertools.product(shapes, dtypes):
             rtol = 1e-2 if dtype == 'float32' else 1e-3
             atol = 1e-4 if dtype == 'float32' else 1e-5
             test_eigvals = TestEigvals()
             if hybridize:
                 test_eigvals.hybridize()
             if 0 in shape:
                 a_np = onp.ones(shape)
             else:
                 if dtype == 'uint8' or dtype == 'int8' or dtype == 'int32' or dtype == 'int64':
                     n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
                     a_np = onp.array([onp.diag(onp.random.randint(1, 10, size=shape[-1])) for i in range(n)]).reshape(shape)
                 else:
                     a_np = new_matrix_with_real_eigvals_nd(shape)
             a = np.array(a_np, dtype=dtype)
             # check eigvals validity
             mx_out = test_eigvals(a)
             check_eigvals(mx_out, a.asnumpy())

             # check imperative once again
             mx_out = test_eigvals(a)
             check_eigvals(mx_out, a.asnumpy())


 @use_np
 def test_np_linalg_eigvalsh():
     class TestEigvalsh(HybridBlock):
         def __init__(self, upper):
             super(TestEigvalsh, self).__init__()
             self._upper = upper

         def forward(self, a):
             return np.linalg.eigvalsh(a, upper=self._upper)

     def check_eigvalsh(w, a_np, upper):
         try:
             w_expected = onp.linalg.eigvalsh(a_np, upper)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             assert w.shape == w_expected.shape
             assert_almost_equal(w, w_expected, rtol=rtol, atol=atol)

     def new_matrix_from_sym_matrix_nd(sym_a, upper):
         shape = sym_a.shape
         if 0 in shape:
             return sym_a
         n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
         a = sym_a.reshape(n, shape[-2], shape[-1])
         for idx in range(n):
             for i in range(shape[-2]):
                 for j in range(shape[-1]):
                     if ((upper == True and i > j) or (upper == False and i < j)):
                         a[idx][i][j] = onp.random.uniform(-10., 10.)
         return a.reshape(shape)

     shapes = [
         (0, 0),
         (1, 1),
         (2, 2),
         (3, 3),
         (5, 5),
         (1, 0, 0),
         (0, 4, 4),
         (1, 4, 4),
         (2, 4, 4),
         (5, 5, 5),
         (1, 1, 4, 4),
         (2, 3, 4, 4)
     ]
     dtypes = ['float32', 'float64', 'uint8', 'int8', 'int32', 'int64']
     uppers = [True, False]
     for hybridize in [True, False]:
         for shape, dtype, upper in itertools.product(shapes, dtypes, uppers):
             rtol = 1e-2 if dtype == 'float32' else 1e-3
             atol = 1e-4 if dtype == 'float32' else 1e-5
             test_eigvalsh = TestEigvalsh(upper)
             if hybridize:
                 test_eigvalsh.hybridize()
             if 0 in shape:
                 a_np = onp.ones(shape)
             else:
                 if dtype == 'uint8' or dtype == 'int8' or dtype == 'int32' or dtype == 'int64':
                     n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
                     a_np = onp.array([onp.diag(onp.random.randint(1, 10, size=shape[-1])) for i in range(n)], dtype=dtype).reshape(shape)
                 else:
                     a_np = new_sym_matrix_with_real_eigvals_nd(shape)
                     a_np = new_matrix_from_sym_matrix_nd(a_np, upper)
             a = np.array(a_np, dtype=dtype)
             # check eigvalsh validity
             mx_out = test_eigvalsh(a)
             check_eigvalsh(mx_out, a.asnumpy(), upper)

             # check imperative once again
             mx_out = test_eigvalsh(a)
             check_eigvalsh(mx_out, a.asnumpy(), upper)


 @use_np
 def test_np_linalg_eig():
     class TestEig(HybridBlock):
         def __init__(self):
             super(TestEig, self).__init__()

         def forward(self, a):
             return np.linalg.eig(a)

     def check_eig(w, v, a_np):
         try:
             w_expected, v_expected = onp.linalg.eig(a_np)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             assert w.shape == w_expected.shape
             assert v.shape == v_expected.shape
             if 0 not in a_np.shape:
                 n = int(onp.prod(w.shape[:-1])) if len(shape) > 1 else 1
                 N = a_np.shape[-1]
                 w = w.reshape(n, N)
                 w_expected = w_expected.reshape(n, N)
                 v = v.reshape(n, N, N)
                 v_expected = v_expected.reshape(n, N, N)
                 a_np = a_np.reshape(n, N, N)
                 for i in range(n):
                     # check eigenvector
                     ai = a_np[i]
                     vi = (v[i].asnumpy()).T
                     wi = w[i].asnumpy()
                     for j in range(N):
                         assert_almost_equal(wi[j] * vi[j], onp.matmul(ai, vi[j]), rtol=rtol, atol=atol)

                     # check eigenvalues
                     w1 = onp.sort(w[i].asnumpy())
                     w2 = onp.sort(w_expected[i])
                     assert_almost_equal(w1, w2, rtol=rtol, atol=atol)

     shapes = [
         (0, 0),
         (1, 1),
         (3, 3),
         (5, 5),
         (1, 0, 0),
         (0, 4, 4),
         (1, 4, 4),
         (2, 4, 4),
         (5, 5, 5),
         (1, 1, 4, 4),
         (2, 3, 4, 4)
     ]
     dtypes = ['float32', 'float64', 'uint8', 'int8', 'int32', 'int64']
     for hybridize in [True, False]:
         for shape, dtype in itertools.product(shapes, dtypes):
             rtol = 1e-2 if dtype == 'float32' else 1e-3
             atol = 1e-4 if dtype == 'float32' else 1e-5
             test_eig = TestEig()
             if hybridize:
                 test_eig.hybridize()
             if 0 in shape:
                 a_np = onp.ones(shape)
             else:
                 if dtype == 'uint8' or dtype == 'int8' or dtype == 'int32' or dtype == 'int64':
                     n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
                     a_np = onp.array([onp.diag(onp.random.randint(1, 10, size=shape[-1])) for i in range(n)]).reshape(shape)
                 else:
                     a_np = new_matrix_with_real_eigvals_nd(shape)
             a = np.array(a_np, dtype=dtype)
             # check eig validity
             mx_w, mx_v = test_eig(a)
             check_eig(mx_w, mx_v, a.asnumpy())

             # check imperative once again
             mx_w, mx_v = test_eig(a)
             check_eig(mx_w, mx_v, a.asnumpy())


 @use_np
 def test_np_linalg_eigh():
     class TestEigh(HybridBlock):
         def __init__(self, upper):
             super(TestEigh, self).__init__()
             self.upper = uppers

         def forward(self, a):
             return np.linalg.eigh(a, upper=self.upper)

     def check_eigh(w, v, a_np, upper):
         try:
             w_expected, v_expected = onp.linalg.eigh(a_np, upper)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print(e)
         else:
             assert w.shape == w_expected.shape
             assert v.shape == v_expected.shape
             # check eigenvalues.
             assert_almost_equal(w, w_expected, rtol=rtol, atol=atol)
             # check eigenvectors.
             w_shape, v_shape, a_sym_np = get_sym_matrix_nd(a_np, upper)
             w_np = w.asnumpy()
             v_np = v.asnumpy()
             if 0 not in a_np.shape:
                 w_np = w_np.reshape(w_shape)
                 v_np = v_np.reshape(v_shape)
                 a_sym_np = a_sym_np.reshape(v_shape)
                 for i in range(w_shape[0]):
                     for j in range(w_shape[1]):
                         assert_almost_equal(onp.dot(a_sym_np[i], v_np[i][:, j]), w_np[i][j] * v_np[i][:, j], rtol=rtol, atol=atol)

     def get_sym_matrix_nd(a_np, upper):
         a_res_np = a_np
         shape = a_np.shape
         if 0 not in a_np.shape:
             n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
             nrow, ncol = shape[-2], shape[-1]
             a_np = a_np.reshape(n, nrow, ncol)
             a_res_np = a_np
             for idx in range(n):
                 for i in range(nrow):
                     for j in range(ncol):
                         if ((upper == False and i < j) or (upper == True and i > j)):
                             a_res_np[idx][i][j] = a_np[idx][j][i]
             return (n, nrow), (n, nrow, ncol), a_res_np.reshape(shape)
         else :
             return (0, 0), (0, 0, 0), a_res_np.reshape(shape)

     def new_matrix_from_sym_matrix_nd(sym_a, upper):
         shape = sym_a.shape
         if 0 in shape:
             return sym_a
         n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
         a = sym_a.reshape(n, shape[-2], shape[-1])
         for idx in range(n):
             for i in range(shape[-2]):
                 for j in range(shape[-1]):
                     if ((upper == True and i > j) or (upper == False and i < j)):
                         a[idx][i][j] = onp.random.uniform(-10., 10.)
         return a.reshape(shape)

     shapes = [
         (0, 0),
         (1, 1),
         (3, 3),
         (5, 5),
         (1, 0, 0),
         (0, 4, 4),
         (1, 4, 4),
         (2, 4, 4),
         (5, 5, 5),
         (1, 1, 4, 4),
         (2, 3, 4, 4)
     ]
     dtypes = ['float32', 'float64', 'uint8', 'int8', 'int32', 'int64']
     uppers = [True, False]
     for hybridize in [True, False]:
         for shape, dtype, upper in itertools.product(shapes, dtypes, uppers):
             rtol = 1e-2 if dtype == 'float32' else 1e-3
             atol = 1e-4 if dtype == 'float32' else 1e-5
             test_eigh = TestEigh(upper)
             if hybridize:
                 test_eigh.hybridize()
             if 0 in shape:
                 a_np = onp.ones(shape)
             else:
                 if dtype == 'uint8' or dtype == 'int8' or dtype == 'int32' or dtype == 'int64':
                     n = int(onp.prod(shape[:-2])) if len(shape) > 2 else 1
                     a_np = onp.array([onp.diag(onp.random.randint(1, 10, size=shape[-1])) for i in range(n)], dtype=dtype).reshape(shape)
                 else:
                     a_np = new_sym_matrix_with_real_eigvals_nd(shape)
                     a_np = new_matrix_from_sym_matrix_nd(a_np, upper)
             a = np.array(a_np, dtype=dtype)
             # check eigh validity
             w, v = test_eigh(a)
             check_eigh(w, v, a.asnumpy(), upper)

             # check imperative once again
             w, v = test_eigh(a)
             check_eigh(w, v, a.asnumpy(), upper)


 @use_np
 def test_np_linalg_det():
     class TestDet(HybridBlock):
         def __init__(self):
             super(TestDet, self).__init__()

         def forward(self, a):
             return np.linalg.det(a)

     # test non zero size input
     tensor_shapes = [
         (2, 0, 2, 2),
         (4, 4),
         (0, 2, 2, 2),
         (3, 3, 3),
         (0, 2, 2),
         (2, 2, 2, 2, 2),
         (1, 1),
     ]
     types = [onp.float32, onp.float64]
     grad_reqs = ['write', 'add', 'null']

     for hybridize, dtype, shape, grad_req in itertools.product([True, False], types, tensor_shapes, grad_reqs):
         a_shape = (1,) + shape
         test_det = TestDet()
         if hybridize:
             test_det.hybridize()
         a = rand_ndarray(shape=a_shape, dtype=dtype).as_np_ndarray()
         a.attach_grad(grad_req)
         np_out = onp.linalg.det(a.asnumpy())
         with mx.autograd.record():
             mx_out = test_det(a)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-1, atol=1e-1)
         if grad_req != 'null':
             mx_out.backward()

         # Test imperative once again
         mx_out = np.linalg.det(a)
         np_out = onp.linalg.det(a.asnumpy())
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-1, atol=1e-1)

         # test numeric gradient
         a_sym = mx.sym.Variable("a").as_np_ndarray()
         mx_sym = mx.sym.np.linalg.det(a_sym).as_nd_ndarray()
         if 0 not in shape and grad_req != 'null':
             check_numeric_gradient(mx_sym, [a.as_nd_ndarray()], rtol=1e-1, atol=1e-1, dtype=dtype)


 @use_np
 @retry(3)
 @pytest.mark.parametrize('grad_req', ['write', 'add', 'null'])
 @pytest.mark.parametrize('dtype', [onp.float32, onp.float64])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('a_shape', [
     (2, 0, 2, 2),
     (5, 5),
     (0, 2, 2, 2),
     (3, 3, 3),
     (0, 3, 3),
     (2, 2, 2, 2, 2),
     (1, 1)
 ])
 @pytest.mark.xfail('win' in sys.platform, reason="Flaky test even with very high tolerance, tracked in #18184")
 def test_np_linalg_slogdet(a_shape, grad_req, dtype, hybridize):
     class TestSlogdet(HybridBlock):
         def __init__(self):
             super(TestSlogdet, self).__init__()

         def forward(self, a):
             return np.linalg.slogdet(a)

     test_slogdet = TestSlogdet()
     if hybridize:
         test_slogdet.hybridize()
     a = rand_ndarray(shape=a_shape, dtype=dtype).as_np_ndarray()
     a.attach_grad(grad_req)

     np_out = onp.linalg.slogdet(a.asnumpy())
     with mx.autograd.record():
         mx_out = test_slogdet(a)
     assert mx_out[0].shape == np_out[0].shape
     assert mx_out[1].shape == np_out[1].shape
     assert_almost_equal(mx_out[0].asnumpy(), np_out[0], rtol=1e-1, atol=1e-1)
     assert_almost_equal(mx_out[1].asnumpy(), np_out[1], rtol=1e-1, atol=1e-1)
     if grad_req != 'null':
         mx_out[1].backward()

     # Test imperative once again
     mx_out = np.linalg.slogdet(a)
     np_out = onp.linalg.slogdet(a.asnumpy())
     assert_almost_equal(mx_out[0].asnumpy(), np_out[0], rtol=1e-1, atol=1e-1)
     assert_almost_equal(mx_out[1].asnumpy(), np_out[1], rtol=1e-1, atol=1e-1)


 @use_np
 def test_np_vstack():
     class TestVstack(HybridBlock):
         def __init__(self):
             super(TestVstack, self).__init__()

         def forward(self, a, *args):
             return np.vstack([a] + list(args))

     def g(data):
         return onp.ones_like(data)

     configs = [
         ((), (), ()),
         ((2), (2), (2)),
         ((0), (0), (0)),
         ((2, 2), (3, 2), (0, 2)),
         ((2, 3), (1, 3), (4, 3)),
         ((2, 2, 2), (3, 2, 2), (1, 2, 2)),
         ((0, 1, 1), (4, 1, 1), (5, 1, 1)),
         ((2), (0, 2), (2, 2))
     ]
     types = ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']
     for config in configs:
         for hybridize in [True, False]:
             for dtype in types:
                 test_vstack = TestVstack()
                 if hybridize:
                     test_vstack.hybridize()
                 rtol = 1e-3
                 atol = 1e-5
                 v = []
                 v_np = []
                 for i in range(3):
                     v_np.append(onp.array(onp.random.uniform(-10.0, 10.0, config[i]), dtype=dtype))
                     v.append(mx.nd.array(v_np[i]).as_np_ndarray())
                     v[i].attach_grad()
                 expected_np = onp.vstack(v_np)
                 with mx.autograd.record():
                     mx_out = test_vstack(*v)
                 assert mx_out.shape == expected_np.shape
                 assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)

                 # Test gradient
                 mx_out.backward()
                 for i in range(3):
                     expected_grad = g(v_np[i])
                     assert_almost_equal(v[i].grad.asnumpy(), expected_grad, rtol=rtol, atol=atol)

                 # Test imperative once again
                 mx_out = np.vstack(v)
                 expected_np = onp.vstack(v_np)
                 assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)


 @use_np
 def test_np_full():
     class TestFull(HybridBlock):
         def __init__(self, shape, dtype=None):
             super(TestFull, self).__init__()
             self._shape = shape
             self._dtype = dtype

         def forward(self, a):
             return np.full(self._shape, a, dtype=self._dtype)

     configs = [
         ((3, 4), 2.0),
         ((0, 3), 2.0),
         ((2, 3), True),
         ((3, 0), False),
         ((3, 4), np.array(2.0)),
         ((0, 3), np.array(2.0)),
         ((2, 3), np.array([1, 2, 3], dtype=np.float32)),
         ((2, 3), np.array([1, 2, 3], dtype=np.int64)),
         ((0, 3), np.array([1, 2, 3], dtype=np.float32)),
         ((0, 3), np.array([1, 2, 3], dtype=np.int64)),
     ]

     rtol, atol = 1e-3, 1e-5
     dtypes = ['float16', 'float32', 'float64', 'int8', 'int32', 'int64', 'bool']
     for shape, fill_value in configs:
         for hybridize in [True, False]:
             for dtype in dtypes:
                 if isinstance(fill_value, np.ndarray):
                     test_full = TestFull(shape, dtype=dtype)
                     if hybridize:
                         test_full.hybridize()
                     mx_out = test_full(fill_value)
                     expected_np = onp.full(shape, fill_value.asnumpy(), dtype=dtype)
                     assert mx_out.shape == expected_np.shape
                     assert mx_out.dtype == expected_np.dtype
                     assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)

                 # Test imperative once again
                 mx_out = np.full(shape, fill_value, dtype=dtype)
                 if isinstance(fill_value, np.ndarray):
                     expected_np = onp.full(shape, fill_value.asnumpy(), dtype=dtype)
                 else:
                     expected_np = onp.full(shape, fill_value, dtype=dtype)
                 assert mx_out.shape == expected_np.shape
                 assert mx_out.dtype == expected_np.dtype
                 assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.skip(reason='Skipped as the test is flaky and the feature causes curand error. Tracked in #18100')
 def test_np_full_like():
     class TestFullLike(HybridBlock):
         def __init__(self, fill_value, dtype, device):
             super(TestFullLike, self).__init__()
             self._fill_value = fill_value
             self._dtype = dtype
             self._device = device

         def forward(self, x, *args, **kwargs):
             return np.full_like(x, self._fill_value, dtype=self._dtype, device=self._device)

     if StrictVersion(platform.python_version()) < StrictVersion('3.0.0'):
         return

     dtypes = ['float64', 'float32', 'float16', 'int64', 'int32', 'int8', 'bool']
     shapes = [
         (),
         (1,),
         (4, 3),
         (4, 5),
         (2, 1),
         (6, 5, 6),
         (4, 2, 1, 2),
         (5, 1, 3, 3),
         (3, 3, 1, 0),
     ]
     # numpy.full_like operator in py2 cannot handle shape like (5, 0, 3) properly
     fill_values = [0, 1, 2, 3, 4, 5, 6, True, False]
     flags = [True, False]
     for fill_value, dtype, shape, hybridize in itertools.product(
         fill_values, dtypes, shapes, flags):
         param_dtype = onp.random.choice(dtypes)
         a = np.random.uniform(low=0, high=100, size=shape, dtype='float64').astype(dtype)
         test = TestFullLike(fill_value, param_dtype, npx.current_device())
         expected_ret = onp.full_like(a.asnumpy(), fill_value=fill_value, dtype=param_dtype)
         if hybridize:
             test.hybridize()
         ret = test(a)
         assert_almost_equal(ret.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)

         # check imperative again
         ret = np.full_like(a, fill_value, param_dtype)
         assert_almost_equal(ret.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_roll():
     class TestRoll(HybridBlock):
         def __init__(self, shift=None, axis=None):
             super(TestRoll, self).__init__()
             self._shift = shift
             self._axis = axis

         def forward(self, x):
             return np.roll(x, shift=self._shift, axis=self._axis)

     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     configs = [
         ((), (3,), None),
         ((1,), (-3,), None),
         ((20,), (-3,), None),
         ((3,), (2,), 0),
         ((2, 3, 4), (12,), (1,)),
         ((2, 3, 4), (10, -10), (0, 1)),
         ((2, 3, 4, 5), (0, 1), (-1, 2)),
         ((2, 3, 0, 1), (0, 1), (-1, 2)),
         ((2, 3, 4, 5), 10, (0, 2)),
     ]
     i_dtype = {"float32" : onp.float32,
                "float64" : onp.float64
                }
     for dtype in dtypes:
         for config in configs:
             for hybridize in [False, True]:
                 shape, shift, axis = config[0], config[1], config[2]
                 x = rand_ndarray(shape=shape, dtype=dtype).as_np_ndarray()
                 net = TestRoll(shift=shift, axis=axis)
                 np_out = onp.roll(x.asnumpy(), shift=shift, axis=axis)
                 if hybridize:
                     net.hybridize()
                 x.attach_grad()
                 with mx.autograd.record():
                     mx_out = net(x)
                 assert mx_out.shape == np_out.shape
                 mx_out.backward()
                 assert same(mx_out.asnumpy(), np_out)
                 assert same(x.grad.shape, x.shape)
                 assert same(x.grad.asnumpy(), onp.ones(shape))

                 # test imperativen
                 np_out = onp.roll(x.asnumpy(), shift=shift, axis=axis)
                 mx_out = np.roll(x, shift=shift, axis=axis)
                 assert same(mx_out.asnumpy(), np_out)

                 # test numeric
                 if dtype in ['float32', 'float64'] and len(shape)> 0 and  onp.prod(shape) > 0:
                     x_sym = mx.sym.Variable("x").as_np_ndarray()
                     mx_sym = mx.sym.np.roll(x_sym, shift=shift, axis=axis).as_nd_ndarray()
                     check_numeric_gradient(mx_sym, [x.as_nd_ndarray()],
                                            numeric_eps=1e-3, rtol=1e-3, atol=1e-5, dtype=i_dtype[dtype])


 @use_np
 def test_np_trace():
     class TestTrace(HybridBlock):
         def __init__(self, axis1, axis2, offset):
             super(TestTrace, self).__init__()
             self._axis1 = axis1
             self._axis2 = axis2
             self._offset = offset

         def forward(self, data):
             return np.trace(data, axis1=self._axis1, axis2=self._axis2, offset=self._offset)

     def g(data, axis1, axis2, offset):
         idx = onp.indices(data.shape)
         ret = onp.zeros_like(data)
         ret[idx[axis1] + offset == idx[axis2]] = 1.0
         return ret

     shapes = [
         (3, 3),
         (3, 4),
         (0, 0),
         (3, 3, 3),
         (0, 0, 0),
         (2, 2, 4, 3),
         (2, 2, 4, 3),
         (2, 0, 3, 0),
         (2, 0, 2, 3)
     ]
     offsets = range(-5, 5)
     dtypes = ['int32', 'float16', 'float32', 'float64']
     for hybridize in [True, False]:
         for shape in shapes:
             ndim = len(shape)
             for axis1 in range(-ndim, ndim):
                 for axis2 in range(-ndim, ndim):
                     if (axis1 + ndim) % ndim != (axis2 + ndim) % ndim:
                         for offset in offsets:
                             for dtype in dtypes:
                                 if dtype == 'float16':
                                     rtol = atol = 1e-2
                                 else:
                                     rtol = atol = 1e-5
                                 test_trace = TestTrace(axis1, axis2, offset)
                                 if hybridize:
                                     test_trace.hybridize()
                                 data_np = onp.random.uniform(-10.0, 10.0, shape)
                                 data = mx.nd.array(data_np, dtype=dtype)
                                 data_np = data.asnumpy()
                                 data.attach_grad()
                                 expected_np = onp.trace(data_np, axis1=axis1, axis2=axis2, offset=offset)
                                 with mx.autograd.record():
                                     out_mx = test_trace(data.as_np_ndarray())
                                 assert out_mx.shape == expected_np.shape
                                 assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol)
                                 out_mx.backward()
                                 backward_expected = g(data_np, axis1=axis1, axis2=axis2, offset=offset)
                                 assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)

                                 # Test imperative once again
                                 data = mx.nd.array(data_np, dtype=dtype)
                                 out_mx = np.trace(data.as_np_ndarray(), axis1=axis1, axis2=axis2, offset=offset)
                                 assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol)

     # bad params
     params = [
         ([], 0, 1, 0),
         ([2], 0, 1, 0),
         ([3, 2, 2], 1, 1, 1),
         ([3, 2, 2], 0, -4, 1)
     ]
     for shape, axis1, axis2, offset in params:
         data_np = onp.random.uniform(-1.0, 1.0, shape)
         data_mx = mx.nd.array(data_np)
         try:
             output = np.trace(data_mx.as_np_ndarray(), axis1=axis1, axis2=axis2, offset=offset)
         except mx.base.MXNetError:
             continue
         assert False


 @use_np
 def test_np_windows():
     class TestWindows(HybridBlock):
         def __init__(self, func, M):
             super(TestWindows, self).__init__()
             self._func = func
             self._M = M

         def forward(self, x, *args, **kwargs):
             op = getattr(np, self._func)
             assert op is not None
             return x + op(M=self._M)

     configs = [-10, -3, -1, 0, 1, 6, 10, 20]
     dtypes = ['float32', 'float64']
     funcs = ['hanning', 'hamming', 'blackman']
     for config in configs:
         for dtype in dtypes:
             for func in funcs:
                 x = np.zeros(shape=(), dtype=dtype)
                 for hybridize in [False, True]:
                     np_func = getattr(onp, func)
                     mx_func = TestWindows(func, M=config)
                     np_out = np_func(M=config).astype(dtype)
                     if hybridize:
                         mx_func.hybridize()
                     mx_out = mx_func(x)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                     # test imperative
                     mx_out = getattr(np, func)(M=config)
                     np_out = np_func(M=config).astype(dtype)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_flip():
     class TestFlip(HybridBlock):
         def __init__(self, axis):
             super(TestFlip, self).__init__()
             self.axis = axis

         def forward(self, x):
             return np.flip(x, self.axis)

     shapes = [(1, 2, 3), (1, 0), ()]
     types = ['int32', 'int64', 'float16', 'float32', 'float64']
     for hybridize in [True, False]:
         for oneType in types:
             rtol, atol=1e-3, 1e-5
             for shape in shapes:
                 axis = random.randint(-len(shape), len(shape))
                 if axis == len(shape):
                     axis = None
                 test_flip = TestFlip(axis)
                 if hybridize:
                     test_flip.hybridize()
                 x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                 x.attach_grad()
                 np_out = onp.flip(x.asnumpy(), axis)
                 with mx.autograd.record():
                     mx_out = test_flip(x)
                 assert mx_out.shape == np_out.shape
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
                 mx_out.backward()
                 np_backward = onp.ones(np_out.shape)
                 assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

                 # Test imperative once again
                 mx_out = np.flip(x, axis)
                 np_out = onp.flip(x.asnumpy(), axis)
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_flipud_fliplr():
     class TestFlipud(HybridBlock):
         def __init__(self):
             super(TestFlipud, self).__init__()

         def forward(self, x):
             return np.flipud(x)

     class TestFliplr(HybridBlock):
         def __init__(self):
             super(TestFliplr, self).__init__()

         def forward(self, x):
             return np.fliplr(x)

     shapes = [(1, 2, 3), (1, 0)]
     types = ['int32', 'int64', 'float16', 'float32', 'float64']
     for func in ['flipud', 'fliplr']:
         for hybridize in [True, False]:
             for oneType in types:
                 rtol, atol=1e-3, 1e-5
                 for shape in shapes:
                     if func == 'flipud':
                         test_flip = TestFlipud()
                     else:
                         test_flip = TestFliplr()
                     if hybridize:
                         test_flip.hybridize()
                     x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                     x.attach_grad()
                     if func == 'flipud':
                         np_out = onp.flipud(x.asnumpy())
                     else:
                         np_out = onp.fliplr(x.asnumpy())
                     with mx.autograd.record():
                         mx_out = test_flip(x)
                     assert mx_out.shape == np_out.shape
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
                     mx_out.backward()
                     np_backward = onp.ones(np_out.shape)
                     assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

                     # Test imperative once again
                     if func == 'flipud':
                         mx_out = np.flipud(x)
                         np_out = onp.flipud(x.asnumpy())
                     else:
                         mx_out = np.fliplr(x)
                         np_out = onp.fliplr(x.asnumpy())
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.flaky
 def test_np_around():
     class TestAround(HybridBlock):
         def __init__(self, decimals):
             super(TestAround, self).__init__()
             self.decimals = decimals

         def forward(self, x):
             return np.around(x, self.decimals)

     shapes = [(), (1, 2, 3), (1, 0)]
     types = ['int32', 'int64', 'float32', 'float64']
     for hybridize in [True, False]:
         for oneType in types:
             rtol, atol = 1e-3, 1e-5
             for shape in shapes:
                 for d in range(-5, 6):
                     test_around = TestAround(d)
                     if hybridize:
                         test_around.hybridize()
                     x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                     np_out = onp.around(x.asnumpy(), d)
                     mx_out = test_around(x)
                     assert mx_out.shape == np_out.shape
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)

                     mx_out = np.around(x, d)
                     np_out = onp.around(x.asnumpy(), d)
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_flatnonzero():
     class TestFlatnonzero(HybridBlock):
         def __init__(self):
             super(TestFlatnonzero, self).__init__()

         def forward(self, a):
             return np.flatnonzero(a)

     shapes = [(1,), (4, 3), (4, 5), (2, 1), (6, 5, 6), (4, 2, 1, 2),
               (5, 1, 3, 3), (3, 3, 1, 0),]
     types = ['int32', 'int64', 'float32', 'float64']
     hybridizes = [True, False]
     for hybridize, oneType, shape in itertools.product(hybridizes, types, shapes):
         rtol, atol = 1e-3, 1e-5
         test_flatnonzero = TestFlatnonzero()
         if hybridize:
             test_flatnonzero.hybridize()
         x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
         np_out = onp.flatnonzero(x.asnumpy())
         mx_out = test_flatnonzero(x)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)

         mx_out = np.flatnonzero(x)
         np_out = onp.flatnonzero(x.asnumpy())
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_round():
     class TestRound(HybridBlock):
         def __init__(self, func, decimals):
             super(TestRound, self).__init__()
             self.func = func
             self.decimals = decimals

         def forward(self, x):
             return getattr(np, self.func)(x, self.decimals)

     shapes = [(), (1, 2, 3), (1, 0)]
     types = ['int32', 'int64', 'float32', 'float64']
     funcs = ['round', 'round_']
     for hybridize, oneType, func in itertools.product([True, False], types, funcs):
         rtol, atol = 1e-3, 1e-5
         for shape in shapes:
             for d in range(-5, 6):
                 test_round = TestRound(func, d)
                 if hybridize:
                     test_round.hybridize()
                 x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                 np_out = getattr(onp, func)(x.asnumpy(), d)
                 mx_out = test_round(x)
                 assert mx_out.shape == np_out.shape
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)

                 mx_out = getattr(mx.np, func)(x, d)
                 np_out = getattr(onp, func)(x.asnumpy(), d)
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_nonzero():
     class TestNonzero(HybridBlock):
         def __init__(self):
             super(TestNonzero, self).__init__()

         def forward(self, x):
             return npx.nonzero(x)

     types = ['int32', 'int64', 'float64', 'float32', 'float16']
     for hybridize in [True, False]:
         for shape in [(), (1, 2, 3), (1, 0)]:
             for oneType in types:
                 rtol, atol = 1e-3, 1e-5
                 test_nonzero = TestNonzero()
                 if hybridize:
                     test_nonzero.hybridize()
                 x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
                 np_out = onp.nonzero(x.asnumpy())
                 np_out = onp.transpose(np_out)
                 mx_out = test_nonzero(x)
                 assert mx_out.shape == np_out.shape
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol, atol)

                 # Test imperative once again
                 mx_out = npx.nonzero(x)
                 np_out = onp.nonzero(x.asnumpy())
                 np_out = onp.transpose(np_out)
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol, atol)


 @use_np
 def test_np_unique():
     class TestUnique(HybridBlock):
         def __init__(self, return_index=False, return_inverse=False, return_counts=False, axis=None):
             super(TestUnique, self).__init__()
             self._return_index = return_index
             self._return_inverse = return_inverse
             self._return_counts = return_counts
             self._axis = axis

         def forward(self, a):
             return np.unique(a, self._return_index, self._return_inverse, self._return_counts, self._axis)

     configs = [
         ((), True, True, True, None),
         ((1, ), True, True, True, -1),
         ((5, ), False, False, False, 0),
         ((5, ), True, False, False, 0),
         ((5, ), True, True, False, 0),
         ((5, ), True, True, True, 0),
         ((5, ), True, True, True, None),
         ((5, 4), True, True, True, None),
         ((5, 4), True, True, True, -1),
         ((5, 0, 4), True, True, True, None),
         ((0, 0, 0), True, True, True, None),
         # ((5, 3, 4), True, True, True, -1), # waiting for numpy 1.18, details in pr 14255
         ((5, 3, 4), True, True, True, None),
         ((5, 3, 4), True, True, True, 1),
     ]
     for dtype in ['float32', 'float64', 'int8', 'uint8', 'int32', 'int64']:
         for hybridize in [False, True]:
             for config in configs:
                 test_unique = TestUnique(*config[1:])
                 if hybridize:
                     test_unique.hybridize()
                 x = onp.random.uniform(-8.0, 8.0, size=config[0])
                 x = np.array(x, dtype=dtype)
                 np_out = onp.unique(x.asnumpy(), *config[1:])
                 mx_out = test_unique(x)
                 if (len(mx_out)) == 1:
                     assert mx_out.shape == np_out.shape
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                 else:
                     for i in range(len(mx_out)):
                         assert mx_out[i].shape == np_out[i].shape
                         assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5)

                 # Test imperative once again
                 mx_out = np.unique(x, *config[1:])
                 np_out = onp.unique(x.asnumpy(), *config[1:])
                 if (len(mx_out)) == 1:
                     assert mx_out.shape == np_out.shape
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                 else:
                     for i in range(len(mx_out)):
                         assert mx_out[i].shape == np_out[i].shape
                         assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.parametrize('shape,index,inverse,counts', [
     ((), True, True, True),
     ((1, ), True, True, True),
     ((5, ), True, True, True),
     ((5, ), True, True, True),
     ((5, 4), True, True, True),
     ((5, 0, 4), True, True, True),
     ((0, 0, 0), True, True, True),
     ((5, 3, 4), True, True, True),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64', 'int8', 'uint8', 'int32', 'int64'])
 @pytest.mark.parametrize('hybridize', [False, True])
 def test_np_unique_all(shape, index, inverse, counts, dtype, hybridize):
     class TestUniqueAll(HybridBlock):
         def __init__(self):
             super(TestUniqueAll, self).__init__()

         def forward(self, a):
             return np.unique_all(a)

     test_unique = TestUniqueAll()
     if hybridize:
         test_unique.hybridize()
     x = onp.random.uniform(-8.0, 8.0, size=shape)
     x = np.array(x, dtype=dtype)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     mx_out = test_unique(x)
     for i in range(len(mx_out)):
         assert mx_out[i].shape == np_out[i].shape
         assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5)

     # Test imperative once again
     mx_out = np.unique_all(x)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     assert mx_out.values.shape == np_out[0].shape
     assert_almost_equal(mx_out.values.asnumpy(), np_out[0], rtol=1e-3, atol=1e-5)
     assert mx_out.indices.shape == np_out[1].shape
     assert_almost_equal(mx_out.indices.asnumpy(), np_out[1], rtol=1e-3, atol=1e-5)
     assert mx_out.inverse_indices.shape == np_out[2].shape
     assert_almost_equal(mx_out.inverse_indices.asnumpy(), np_out[2], rtol=1e-3, atol=1e-5)
     assert mx_out.counts.shape == np_out[3].shape
     assert_almost_equal(mx_out.counts.asnumpy(), np_out[3], rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.parametrize('shape,index,inverse,counts', [
     ((), False, True, False),
     ((1, ), False, True, False),
     ((5, ), False, True, False),
     ((5, ), False, True, False),
     ((5, 4), False, True, False),
     ((5, 0, 4), False, True, False),
     ((0, 0, 0), False, True, False),
     ((5, 3, 4), False, True, False),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64', 'int8', 'uint8', 'int32', 'int64'])
 @pytest.mark.parametrize('hybridize', [False, True])
 def test_np_unique_inverse(shape, index, inverse, counts, dtype, hybridize):
     class TestUniqueInverse(HybridBlock):
         def __init__(self):
             super(TestUniqueInverse, self).__init__()

         def forward(self, a):
             return np.unique_inverse(a)

     test_unique = TestUniqueInverse()
     if hybridize:
         test_unique.hybridize()
     x = onp.random.uniform(-8.0, 8.0, size=shape)
     x = np.array(x, dtype=dtype)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     mx_out = test_unique(x)
     for i in range(len(mx_out)):
         assert mx_out[i].shape == np_out[i].shape
         assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5)

     # Test imperative once again
     mx_out = np.unique_inverse(x)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     assert mx_out.values.shape == np_out[0].shape
     assert_almost_equal(mx_out.values.asnumpy(), np_out[0], rtol=1e-3, atol=1e-5)
     assert mx_out.inverse_indices.shape == np_out[1].shape
     assert_almost_equal(mx_out.inverse_indices.asnumpy(), np_out[1], rtol=1e-3, atol=1e-5)


 @use_np
 @pytest.mark.parametrize('shape,index,inverse,counts', [
     ((), False, False, False),
     ((1, ), False, False, False),
     ((5, ), False, False, False),
     ((5, ), False, False, False),
     ((5, 4), False, False, False),
     ((5, 0, 4), False, False, False),
     ((0, 0, 0), False, False, False),
     ((5, 3, 4), False, False, False),
 ])
 @pytest.mark.parametrize('dtype', ['float32', 'float64', 'int8', 'uint8', 'int32', 'int64'])
 @pytest.mark.parametrize('hybridize', [False, True])
 def test_np_unique_values(shape, index, inverse, counts, dtype, hybridize):
     class TestUniqueValues(HybridBlock):
         def __init__(self):
             super(TestUniqueValues, self).__init__()

         def forward(self, a):
             return np.unique_values(a)

     test_unique = TestUniqueValues()
     if hybridize:
         test_unique.hybridize()
     x = onp.random.uniform(-8.0, 8.0, size=shape)
     x = np.array(x, dtype=dtype)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     mx_out = test_unique(x)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)

     # Test imperative once again
     mx_out = np.unique_values(x)
     np_out = onp.unique(x.asnumpy(), return_index=index, return_inverse=inverse, return_counts=counts)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_take():
     configs = [
         ((4, 4), (4, 0), None),
         ((4, 4), (4, 0), 0),
         ((4, 4), (4, 0), 1),
         ((), (4, 0), None),
         ((), (5, ), None),
         ((), (4, 5), None),
         ((), (), None),
         ((3, 4), (), None),
         ((3, 4), (), 0),
         ((3, 4), (), 1),
         ((3, 4, 5), (), 2),
         ((3, 4, 5), (), -3),
     ]

     class TestTake(HybridBlock):
         def __init__(self, axis, mode):
             super(TestTake, self).__init__()
             self._axis = axis
             self._mode = mode

         def forward(self, a, indices):
             return np.take(a, indices, axis=self._axis, mode=self._mode)

     def grad_helper(grad_in, axis, idx, mode):
         k = 1 if axis == None else grad_in.shape[axis]
         if mode == 'clip':
             idx = 0 if idx < 0 else idx
             idx = k - 1 if idx >= k else idx
         else:
             idx = idx % k

         if axis == None:
             if grad_in.shape == ():
                 grad_in += 1.0
             else:
                 grad_in[idx] += 1.0
         elif axis == 0:
             if axis == len(grad_in.shape) - 1:
                 grad_in[idx] += 1.0
             else:
                 grad_in[idx, :] += 1.0
         elif axis == 1:
             if axis == len(grad_in.shape) - 1:
                 grad_in[:, idx] += 1.0
             else:
                 grad_in[:, idx, :] += 1.0
         elif axis == 2:
             if axis == len(grad_in.shape) - 1:
                 grad_in[:, :, idx] += 1.0
             else:
                 grad_in[:, :, idx, :] += 1.0
         elif axis == 3:
             if axis == len(grad_in.shape) - 1:
                 grad_in[:, :, :, idx] += 1.0
             else:
                 grad_in[:, :, :, idx, :] += 1.0
         elif axis == 4:
             grad_in[:, :, :, :, idx] += 1.0
         else:
             raise ValueError(f"axis {axis} is not supported...")

     def check_output_n_grad(data_shape, idx_shape, axis, mode):
         data_real = onp.random.normal(size=data_shape).astype('float32')
         idx_real = onp.random.randint(low=-100, high=100, size=idx_shape)

         assert same(np.take(np.array(data_real), np.array(idx_real), axis=axis, mode=mode).asnumpy(),
              onp.take(data_real, idx_real, axis=axis, mode=mode))

         grad_in = onp.zeros(data_shape, dtype='float32')

         test_take = TestTake(axis=axis, mode=mode)
         if hybridize:
             test_take.hybridize()
         x = np.array(data_real)
         x.attach_grad()
         with mx.autograd.record():
             mx_out = test_take(x, np.array(idx_real))
         assert same(mx_out.asnumpy(), onp.take(data_real, idx_real, axis=axis, mode=mode))

         if axis and axis < 0:
             axis += len(data_shape)

         if idx_real.size != 0:
             for i in onp.nditer(idx_real):
                 grad_helper(grad_in, axis, i, mode)


         mx_out.backward()
         same(x.grad.asnumpy(), grad_in)

     for hybridize in [True, False]:
         for mode in ['clip', 'wrap']:
             for data_ndim in range(1, 5):
                 for idx_ndim in range(1, 4):
                     for axis in range(-data_ndim, data_ndim):
                         data_shape = ()
                         for _ in range(data_ndim):
                             data_shape += (onp.random.randint(low=1, high=5), )
                         idx_shape = ()
                         for _ in range(idx_ndim):
                             idx_shape += (onp.random.randint(low=1, high=5), )
                         check_output_n_grad(data_shape, idx_shape, axis, mode)

             for config in configs:
                 check_output_n_grad(config[0], config[1], config[2], mode)


 def test_np_builtin_op_signature():
     import inspect
     from mxnet import _numpy_op_doc
     builtin_np_op_names = [name for name in get_all_registered_operators() if name.startswith('_np_')]
     for op_name in builtin_np_op_names:
         _op_from_doc = getattr(_numpy_op_doc, op_name, None)
         assert _op_from_doc is not None, "Failed to find documentation for operator {}. " \
                                          "Please add the documentation in _numpy_op_doc.py for this operator."\
             .format(op_name)
         op = _get_builtin_op(op_name)
         assert op is not None
         assert str(op.__signature__) == str(inspect.signature(_op_from_doc))


 @use_np
 def test_np_tril_indices():
     class TestTrilindices(HybridBlock):
         def __init__(self, n, k=0, m=None):
             super(TestTrilindices, self).__init__()
             self._n = n;
             self._k = k;
             if m is None:
                 m = n
             self._m = m

         def forward(self, x, *args, **kwargs):
             return x, np.tril_indices(n=self._n, k=self._k, m=self._m)

     for n in onp.random.random_integers(-10, 50, 2):
         for k in onp.random.random_integers(-50, 50, 2):
             for m in onp.random.random_integers(-10, 50, 2):
                 np_out = onp.tril_indices(n, k, m)
                 for hybridize in [True, False]:
                     # dummy nparray for hybridize
                     x = np.ones((1,1))
                     test_trilindices = TestTrilindices(int(n), int(k), int(m))
                     if hybridize:
                         test_trilindices.hybridize()
                     mx_out = test_trilindices(x)[1]
                     assert len(mx_out) == 2
                     assert same(mx_out[0], np_out[0])
                     assert same(mx_out[1], np_out[1])
                     if n > 0 and m > 0 and hybridize is False:
                         np_data = onp.arange(n*m).reshape(n, m)
                         mx_data = np.array(np_data)
                         np_data[np_out] = -10
                         mx_data[mx_out] = -10
                         assert same(np_data, mx_data.asnumpy())


 @use_np
 def test_np_fill_diagonal():
     class TestFillDiagonal(HybridBlock):
         def __init__(self, val, wrap=False):
             super(TestFillDiagonal, self).__init__()
             self._val = val
             self._wrap= wrap

         def forward(self, x):
             return np.fill_diagonal(x, val=self._val, wrap=self._wrap)

     configs = [
         ((10, 10), 2),
         ((10, 10), -2),
         ((4, 10), -2),
         ((10, 4), 2),
         ((10, 10), [-2, 2]),
         ((10, 10), [-2, 2]),
         ((10, 5), [-2, 2, -1, -3]),
         ((100, 50), [-2, 2, -1, -3]),
         ((1000, 500), [-2, 2, -1, -3]),
         ((5, 10), [-2, 2, -1, -3]),
         ((50, 100), [-2, 2, -1, -3]),
         ((500, 1000), [-2, 2, -1, -3]),
         ((4, 4, 4), 2),
         ((4, 4, 4, 4), 2),
         ((4, 4, 4, 4, 4), [-1, 2]),
         ((4, 4, 4, 4, 4, 4, 4, 4), 2),
         ((5, 5, 5, 5, 5, 5, 5, 5), [-1, 2, -2]),
         ((6, 6, 6, 6, 6, 6, 6, 6), 2),
         ((7, 7, 7, 7, 7, 7, 7, 7), [-1, 2, -2]),
     ]
     dtypes = ['int8', 'int32', 'int64', 'float16', 'float32', 'float64']
     for dtype in dtypes:
         for config in configs:
             for wrap in [False, True]:
                 np_data = onp.ones(config[0]).astype(dtype)
                 mx_data = np.array(np_data, dtype=dtype)
                 test_filldiagonal = TestFillDiagonal(config[1], wrap)
                 test_filldiagonal(mx_data)
                 onp.fill_diagonal(np_data, config[1], wrap)
                 assert same(np_data, mx_data.asnumpy())


 @use_np
 def test_np_moveaxis():
     class TestMoveaxis(HybridBlock):
         def __init__(self, source=None, destination=None):
             super(TestMoveaxis, self).__init__()
             self._source = source
             self._destination= destination

         def forward(self, x):
             return np.moveaxis(x, source=self._source, destination=self._destination)

     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     for hybridize in [False, True]:
         for dtype in dtypes:
             for ndim in [0, 1, 2, 3, 4, 5, 6]:
                 shape = rand_shape_nd(ndim, dim=5, allow_zero_size=True)
                 np_data = onp.random.uniform(low=-100, high=100, size=shape).astype(dtype)
                 mx_data = np.array(np_data, dtype=dtype)
                 axis = [i for i in range(ndim)]
                 random.shuffle(axis)
                 for i in range(ndim):
                     source = random.sample(axis, i)
                     destination = random.sample(axis, i)

                     # test gluon
                     test_moveaxis = TestMoveaxis(source,destination)
                     if hybridize:
                         test_moveaxis.hybridize()
                     np_out = onp.moveaxis(np_data, source=source, destination=destination)
                     mx_data.attach_grad()
                     with mx.autograd.record():
                         mx_out = test_moveaxis(mx_data)
                     assert mx_out.shape == np_out.shape
                     mx_out.backward()
                     assert same(mx_data.grad.shape, mx_data.shape)
                     assert same(mx_data.grad.asnumpy(), onp.ones(shape))
                     # test imperative
                     np_out = onp.moveaxis(np_data, source=source, destination=destination)
                     mx_out = np.moveaxis(mx_data, source=source, destination= destination)
                     assert np_out.dtype == mx_out.dtype
                     assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_np_rot90():
     class TestTRot90(HybridBlock):
         def __init__(self, k=1, axes=(0, 1)):
             super(TestTRot90, self).__init__()
             self._k = k
             self._axes = axes

         def forward(self, a, *args):
             return np.rot90(a, self._k, self._axes)

     configs = [
         ((2, 3), 1, (0, 1)),
         ((2, 3), 3, (0, 1)),
         ((2, 3), 1, (1, 0)),
         ((2, 3), 2, (1, 0)),
         ((2, 3), 3, (1, 0)),
         ((2, 3), 0, (1, 0)),
         ((2, 3, 4, 5), 3, (1, 2)),
         ((2, 3, 4, 5), -3, (2, 3)),
         ((2, 3, 0, 5), -2, (2, 3)),
         ((2, 0, 0, 5), -3, (2, 3)),
         ((2, 3, 0, 5), 0, (2, 1)),
     ]
     dtypes = ['uint8', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']

     for config in configs:
         for dtype in dtypes:
             for hybridize in [True, False]:
                 shape, k, axes = config[0], config[1], config[2]
                 x = rand_ndarray(shape=shape, dtype=dtype).as_np_ndarray()
                 net = TestTRot90(k=k, axes=axes)
                 if hybridize:
                     net.hybridize()

                 x.attach_grad()
                 np_out = onp.rot90(x.asnumpy(), k=k, axes=axes)
                 with mx.autograd.record():
                     mx_out = net(x)
                 assert mx_out.shape == np_out.shape
                 assert same(mx_out.asnumpy(), np_out)
                 mx_out.backward()
                 np_backward = onp.ones(shape, dtype)

                 assert same(x.grad.asnumpy().shape, np_backward.shape)
                 assert same(x.grad.asnumpy(), np_backward)

                 np_out = onp.rot90(x.asnumpy(), k=k, axes=axes)
                 mx_out = np.rot90(x, k=k, axes=axes)
                 assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_np_hsplit():
     class TestHSplit(HybridBlock):
         def __init__(self, indices_or_sections):
             super(TestHSplit, self).__init__()
             self._indices_or_sections = indices_or_sections

         def forward(self, a, *args, **kwargs):
             return np.hsplit(a, indices_or_sections=self._indices_or_sections)

     shapes = [
         (10,),
         (3, 8, 5),
         (3, 0, 5),
         (3, 8, 5, 6),
         (3, 0, 5, 6),
     ]
     indices_or_sections_num = [
         (2, 4),
         (3, 3),
         (3,),
         (1,),
         2,
     ]
     for hybridize in [True, False]:
         for shape in shapes:
             for indices_or_sections in indices_or_sections_num:
                 # test gluon
                 test_hsplit = TestHSplit(indices_or_sections=indices_or_sections)
                 if hybridize:
                     test_hsplit.hybridize()

                 a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray()
                 a.attach_grad()
                 expected_ret = onp.hsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 with mx.autograd.record():
                     y = test_hsplit(a)
                 assert len(y) == len(expected_ret)
                 for mx_out, np_out in zip(y, expected_ret):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                 mx.autograd.backward(y)
                 assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)

                 # test imperative
                 mx_outs = np.hsplit(a, indices_or_sections=indices_or_sections)
                 np_outs = onp.hsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 for mx_out, np_out in zip(mx_outs, np_outs):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_dsplit():
     class TestDSplit(HybridBlock):
         def __init__(self, indices_or_sections):
             super(TestDSplit, self).__init__()
             self._indices_or_sections = indices_or_sections

         def forward(self, a, *args, **kwargs):
             return np.dsplit(a, indices_or_sections=self._indices_or_sections)

     shapes = [
         (2, 4, 6),
         (3, 0, 6),
         (2, 3, 0, 4),
     ]
     indices_or_sections_num = [
         (2, 4),
         (3, 3),
         (3,),
         (1,),
         2,
     ]
     for hybridize in [True, False]:
         for shape in shapes:
             for indices_or_sections in indices_or_sections_num:
                 # test gluon
                 test_dsplit = TestDSplit(indices_or_sections=indices_or_sections)
                 if hybridize:
                     test_dsplit.hybridize()

                 a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray()
                 a.attach_grad()
                 expected_ret = onp.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 with mx.autograd.record():
                     y = test_dsplit(a)
                 assert len(y) == len(expected_ret)
                 for mx_out, np_out in zip(y, expected_ret):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
                 mx.autograd.backward(y)
                 assert_almost_equal(a.grad.asnumpy(), onp.ones(a.shape), rtol=1e-3, atol=1e-5)

                 # test imperative
                 mx_outs = np.dsplit(a, indices_or_sections=indices_or_sections)
                 np_outs = onp.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections)
                 for mx_out, np_out in zip(mx_outs, np_outs):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_einsum():
     class TestEinsum(HybridBlock):
         def __init__(self, subscripts, optimize):
             super(TestEinsum, self).__init__()
             self.subscripts = subscripts
             self.optimize = optimize

         def forward(self, *operands):
             return np.einsum(self.subscripts, *operands, optimize=self.optimize)

     def dbg(name, data):
         print('type of {} = {}'.format(name, type(data)))
         print('shape of {} = {}'.format(name, data.shape))
         print('{} = {}'.format(name, data))

     configs = [
         ('ii', [(5, 5)], lambda *args: (onp.eye(5),)),
         ('ii->i', [(5, 5)], lambda *args: (onp.eye(5),)),
         ('ij->i', [(5, 5)], lambda *args: (onp.ones((5, 5)),)),
         ('...j->...', [(5, 5)], lambda *args: (onp.ones((5, 5)),)),
         ('ji', [(2, 3)], lambda *args: (onp.ones((2, 3)),)),
         ('ij->ji', [(2, 3)], lambda *args: (onp.ones((2, 3)),)),
         ('i, i', [(5,), (5,)], lambda *args: (args[1], args[0])),
         ('ij, j', [(5, 5), (5,)], lambda *args: (onp.tile(args[1][None, :], [5, 1]),
                                                  args[0].sum(axis=0))),
         ('...j, j', [(5, 5), (5,)], lambda *args: (onp.tile(args[1][None, :], [5, 1]),
                                                    onp.sum(args[0], axis=0))),
         ('..., ...', [(), (2, 3)], lambda *args: (onp.sum(args[1], axis=None),
                                                   args[0] * onp.ones((2, 3)))),
         (', ij', [(), (2, 3)], lambda *args: (onp.sum(args[1], axis=None),
                                               args[0] * onp.ones((2, 3)))),
         ('i, j', [(2,), (5, )], lambda *args: (onp.sum(args[1], axis=None) * onp.ones(2),
                                                onp.sum(args[0], axis=None) * onp.ones(5))),
         ('ijk, jil->kl', [(3, 4, 5), (4, 3, 2)], lambda *args: (onp.tile(onp.transpose(onp.sum(args[1],
                                                                                                axis=-1))[:, :, None],
                                                                          [1, 1, 5]),
                                                                 onp.tile(onp.transpose(onp.sum(args[0],
                                                                                                axis=-1))[:, :, None],
                                                                          [1, 1, 2]))),
         ('ii->i', [(3, 3)], lambda *args: (onp.eye(3),)),
         ('ki, jk->ij', [(3, 2), (4, 3)], lambda *args: (onp.tile(args[1].sum(axis=0)[:, None], [1, 2]),
                                                         onp.tile(args[0].sum(axis=1)[None, :], [4, 1]))),
         ('ki, ...k->i...', [(3, 2), (4, 3)], lambda *args: (onp.tile(args[1].sum(axis=0)[:, None], [1, 2]),
                                                             onp.tile(args[0].sum(axis=1)[None, :], [4, 1]))),
         ('k..., jk', [(3, 2), (4, 3)], lambda *args: (onp.tile(args[1].sum(axis=0)[:, None], [1, 2]),
                                                       onp.tile(args[0].sum(axis=1)[None, :], [4, 1]))),
         ('ij, jk', [(5, 0), (0, 4)], lambda *args: (onp.empty((5, 0)), onp.empty((0, 4)))),
         (('ij,jk,kl->il'), [(2, 2), (2, 5), (5, 2)], lambda *args: (onp.dot(onp.ones((2, 2)), onp.dot(args[1], args[2]).T),
                                                                     onp.dot(args[0].T, onp.dot(onp.ones((2, 2)), args[2].T)),
                                                                     onp.dot(onp.dot(args[0], args[1]).T, onp.ones((2, 2))))),
         # broadcast bug
         ('ij, ij -> i', [(1, 4), (2, 4)], lambda *args: (onp.sum(args[1], axis=0)[None, :],
                                                          onp.tile(args[0], [2, 1]))),
         # one dimensim bug
         ('...ij, ...jk -> ...ik', [(1, 4), (4, 2)], lambda *args: (args[1].sum(axis=1)[None, :],
                                                                    onp.tile(args[0].sum(axis=0)[: ,None], [1, 2]))),
         ('...ij, ...jk -> ...ik', [(2, 4), (4, 2)], lambda *args: (onp.tile(args[1].sum(axis=1)[None, :], [2, 1]),
                                                                    onp.tile(args[0].sum(axis=0)[: ,None], [1, 2]))),
         ('...ij, ...jk -> ...ik', [(3, 2, 1, 4), (3, 2, 4, 2)], lambda *args: (
                                                             args[1].sum(axis=3)[:, :, None, :],
                                                             onp.tile(args[0].sum(axis=2)[:, :, :, None], [1, 1, 1, 2]))),
         ('...ij, ...ik -> ...jk', [(1, 1, 1, 4), (1, 1, 1, 3)], lambda *args: (
                                                             onp.tile(args[1].sum(axis=3)[:, :, :, None], [1, 1, 1, 4]),
                                                             onp.tile(args[0].sum(axis=3)[:, :, : ,None], [1, 1, 1, 3]))),
         ('...ij, ...jc -> ...ic', [(1, 1, 5, 3), (1, 1, 3, 2)], lambda *args: (
                                                             onp.tile(args[1].sum(axis=3)[:, :, None, :], [1, 1, 5, 1]),
                                                             onp.tile(args[0].sum(axis=2)[:, :, : ,None], [1, 1, 1, 2]))),
         ('...ij, ...jc -> ...ic', [(1, 2, 5, 4), (1, 2, 4, 2)], lambda *args: (
                                                             onp.tile(args[1].sum(axis=3)[:, :, None, :], [1, 1, 5, 1]),
                                                             onp.tile(args[0].sum(axis=2)[:, :, : ,None], [1, 1, 1, 2]))),
         ('...ij, ...jc -> ...ic', [(2, 1, 5, 4), (2, 1, 4, 2)], lambda *args: (
                                                             onp.tile(args[1].sum(axis=3)[:, :, None, :], [1, 1, 5, 1]),
                                                              onp.tile(args[0].sum(axis=2)[:, :, : ,None], [1, 1, 1, 2]))),
         # issue #16576
         # commented due to long running time
         # ('abiz,abjz->abij', [(64, 8, 128, 512), (64, 8, 128, 512)], lambda *args: (onp.matmul(onp.ones((64, 8, 128, 128)), args[1]),
         #                                                                            onp.matmul(onp.ones((64, 8, 128, 128)), args[0]))),
     ]
     dtypes = ['float32', 'float64', 'int32']
     acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64',
                 'int32': 'int64'}
     for hybridize in [False, True]:
         for dtype in dtypes:
             for config in configs:
                 for optimize in [False, True]:
                     rtol = 1e-2 if dtype == 'float16' else 1e-3
                     atol = 1e-4 if dtype == 'float16' else 1e-5
                     (subscripts, operands, get_grad) = config
                     test_einsum = TestEinsum(subscripts, optimize)
                     if hybridize:
                         test_einsum.hybridize()
                     x = []
                     x_np = []
                     for shape in operands:
                         tmp = onp.array(onp.random.uniform(-1.0, 1.0, shape), dtype=dtype)
                         x_np.append(tmp.astype(acc_type[dtype]))
                         x.append(np.array(tmp, dtype=dtype))
                         x[-1].attach_grad()
                     expected_np = onp.einsum(subscripts, *x_np, optimize=optimize).astype(dtype)
                     with mx.autograd.record():
                         out_mx = test_einsum(*x)
                     assert out_mx.shape == expected_np.shape
                     assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol)
                     out_mx.backward()
                     for (iop, op) in enumerate(x):
                         assert_almost_equal(op.grad.asnumpy(), get_grad(*x_np)[iop], rtol=rtol, atol=atol)

                     # Test imperative once again
                     for op in x:
                         op.attach_grad()
                     with mx.autograd.record():
                         out_mx = np.einsum(subscripts, *x, optimize=optimize)
                     out_mx.backward()
                     expected_np = onp.einsum(subscripts, *x_np, optimize=optimize)
                     assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol)
                     for (iop, op) in enumerate(x):
                         assert_almost_equal(op.grad.asnumpy(), get_grad(*x_np)[iop].astype(dtype), rtol=rtol, atol=atol)
     configs = [
         (('ij,jk,kl->il'), [(2, 2), (2, 5), (5, 2)]),
         (('ea,fb,abcd,gc,hd->efgh'), [(5, 5), (5, 5), (5, 5, 5, 5), (5, 5), (5, 5)]),
     ]
     dtypes = ['int32', 'float32', 'float64']
     for hybridize in [False, True]:
         for dtype in dtypes:
             for config in configs:
                 (subscripts, operands) = config
                 rtol = 1e-2 if dtype == 'float16' else 1e-3
                 atol = 1e-3 if dtype == 'float16' else 1e-4
                 grad = []
                 x_np = []
                 for shape in operands:
                     x_np.append(onp.array(onp.random.uniform(-2.0, 2.0, shape),
                                           dtype=dtype))
                 for optimize in [False, True]:
                     x = []
                     for iop in range(len(operands)):
                         x.append(np.array(x_np[iop], dtype=dtype))
                         x[-1].attach_grad()
                     test_einsum = TestEinsum(subscripts, optimize)
                     if hybridize:
                         test_einsum.hybridize()
                     expected_np = onp.einsum(subscripts, *[op.astype(acc_type[dtype]) for op in x_np],
                                              optimize=optimize).astype(dtype)
                     with mx.autograd.record():
                         out_mx = test_einsum(*x)
                     assert out_mx.shape == expected_np.shape
                     assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol)
                     out_mx.backward()
                     cur_grad = []
                     for op in x:
                         cur_grad.append(op.grad.asnumpy())
                     grad.append(cur_grad)
                 for iop in range(len(grad[0])):
                     assert_almost_equal(grad[0][iop], grad[1][iop], rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.skip(reason='Skipped as the test is flaky and the feature causes curand error. Tracked in #18100')
 def test_np_diagflat():
     class TestDiagflat(HybridBlock):
         def __init__(self, k=0):
             super(TestDiagflat,self).__init__()
             self._k = k
         def forward(self, a):
             return np.diagflat(a, k=self._k)
     shapes = [(2,),5 , (1,5), (2,2), (2,5), (3,3), (4,3),(4,4,5)] # test_shapes, remember to include zero-dim shape and zero-size shapes
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] # remember to include all meaningful data types for the operator
     range_k = 6
     for hybridize,shape,dtype, in itertools.product([False,True],shapes,dtypes):
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5

         for k in range(-range_k,range_k):
             test_diagflat = TestDiagflat(k)
             if hybridize:
                 test_diagflat.hybridize()

             x = np.random.uniform(-1.0,1.0, size = shape).astype(dtype)
             x.attach_grad()

             np_out = onp.diagflat(x.asnumpy(), k)
             with mx.autograd.record():
                 mx_out = test_diagflat(x)

             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(),np_out,rtol = rtol, atol = atol)

             mx_out.backward()
             # Code to get the reference backward value
             np_backward = np.ones(shape)
             assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

             # Test imperative once again
             mx_out = np.diagflat(x, k)
             np_out = onp.diagflat(x.asnumpy(), k)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_pad():
     class TestPad(HybridBlock):
         def __init__(self, pad_width, mode='constant'):
             super(TestPad,self).__init__()
             self._pad_width = pad_width
             self._mode = mode
         def forward(self, A, **kwargs):
             return np.pad(A, self._pad_width, mode=self._mode, **kwargs)

     shapes = [6, (1,5), (2,2), (2,2), (3,3), (2,3), (3,4,5)]
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     mode = ['constant', 'reflect', 'symmetric', 'edge', 'minimum', 'maximum']
     for hybridize, shape, dtype, in itertools.product([False,True], shapes, dtypes):
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5

         for m in mode:
             x = np.random.uniform(-1.0, 1.0, size = shape).astype(dtype)
             pw = ()
             if (type(shape) == int):
                 pw += (2,3)
             else:
                 for _ in range(len(shape)):
                     pw += ((2,3),)
             test_pad = TestPad(pw, m)
             if hybridize:
                 test_pad.hybridize()
             x.attach_grad()

             if(m != 'constant'):
                 np_out = onp.pad(x.asnumpy(), pw, mode=m)
             else:
                 np_out = onp.pad(x.asnumpy(), pw, mode=m, constant_values=0)
             with mx.autograd.record():
                 mx_out = test_pad(x)

             # code to get the reference value
             assert mx_out.shape == np_out.shape
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol = rtol, atol = atol)

             # test gradient
             if m == "constant":
                 device = mx.device.current_device()
                 x = mx.np.random.uniform(-1.0, 1.0, size=shape)
                 x = mx.np.array(x, device=device)
                 for grad_req in ['write', 'add']:
                     x.attach_grad(grad_req)
                     if grad_req == 'add':
                         init_grad = mx.np.random.uniform(-1.0, 1.0, size=shape, device=device)
                         x.grad[:] = init_grad
                     with mx.autograd.record():
                         mx_out = mx.np.pad(x, pad_width=pw, mode="constant")
                         out_grad = mx.np.random.normal(0, 1, mx_out.shape)
                         out_grad = mx.np.array(out_grad, device=device)
                         loss = mx_out * out_grad
                         loss = loss.sum()
                         loss.backward()
                     gt_in_grad = mx.np.pad(mx.np.ones_like(x.grad), pad_width=pw, mode="constant") * mx.np.array(out_grad, device=device)
                     mx_grad = x.grad
                     if grad_req == 'add':
                         assert_almost_equal(mx.np.pad(mx_grad - init_grad, pad_width=pw, mode="constant"), gt_in_grad.asnumpy(), rtol=rtol, atol=atol)
                     else:
                         assert_almost_equal(mx.np.pad(mx_grad, pad_width=pw, mode="constant"), gt_in_grad.asnumpy(), rtol=rtol, atol=atol)


 @use_np
 def test_np_rand():
     # Test shapes.
     shapes = [
         (3, 3),
         (3, 4),
         (0, 0),
         (3, 3, 3),
         (0, 0, 0),
         (2, 2, 4, 3),
         (2, 2, 4, 3),
         (2, 0, 3, 0),
         (2, 0, 2, 3)
     ]
     dtypes = ['float16', 'float32', 'float64']
     for dtype in dtypes:
         for shape in shapes:
             data_mx = np.random.rand(*shape, dtype=dtype)
             assert data_mx.shape == shape

     # Test random generator.
     device = mx.device.current_device()
     samples = 1000000
     trials = 8
     num_buckets = 10
     lower = 0.0
     upper = 1.0
     for dtype in ['float16', 'float32', 'float64']:
         buckets, probs = gen_buckets_probs_with_ppf(
             lambda x: ss.uniform.ppf(x, lower, upper), num_buckets)
         # Quantize bucket boundaries to reflect the actual dtype
         # and adjust probs accordingly
         buckets = np.array(buckets, dtype=dtype).tolist()
         probs = [(ss.uniform.cdf(buckets[i][1], lower, upper) -
                   ss.uniform.cdf(buckets[i][0], lower, upper))
                  for i in range(num_buckets)]

         def generator_mx(x): return np.random.rand(
             samples, device=device, dtype=dtype).asnumpy()
         verify_generator(generator=generator_mx, buckets=buckets,
                          probs=probs, nsamples=samples, nrepeat=trials)
         generator_mx_same_seed =\
             lambda x: onp.concatenate(
                 [np.random.rand(x // 10, device=device, dtype=dtype).asnumpy()
                     for _ in range(10)])
         verify_generator(generator=generator_mx_same_seed, buckets=buckets,
                          probs=probs, nsamples=samples, nrepeat=trials)


 @use_np
 def test_np_true_divide():
     shapes = [
         [()],
         [(0,)],
         [(2, 0, 3)],
         [(0, 0, 0)],
         [(10,)],
         [(3, 4)],
         [(2, 3, 4)],
         [(2, 3, 4, 5)],
         [(2, 3, 4, 5, 6)],
         [(0,), (0,)],
         [(0,), (1,)],
         [(2, 0, 3), (1, 1)],
         [(), (2, 3)],
         [(2, 3), ()],
         [(2, 3, 1), (1, 4)],
         [(2, 1, 4, 1), (3, 1, 5)],
     ]
     dtypes = [np.bool, np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     itypes = [np.bool, np.int8, np.uint8, np.int32, np.int64]
     ftypes = [np.float16, np.float32, np.float64]
     for shape_pair, dtype in itertools.product(shapes, dtypes):
         a = np.random.uniform(3, 50, size=shape_pair[0]).astype(dtype)
         b = np.random.uniform(3, 50, size=shape_pair[-1]).astype(dtype)
         out_mx = a / b
         if onp.issubdtype(dtype, onp.integer) or (dtype is np.bool):
             assert out_mx.dtype == np.float32
         else:
             assert out_mx.dtype == dtype
         out_np = onp.true_divide(a.asnumpy(), b.asnumpy())
         assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)

         val = onp.random.randint(3, 50)
         out_mx = a / val
         out_np = onp.true_divide(a.asnumpy(), val)
         assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)

         out_mx = val / a
         out_np = onp.true_divide(val, a.asnumpy())
         assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)

     for shape_pair, itype, ftype in itertools.product(shapes, itypes, ftypes):
         i_ = np.random.uniform(3, 50, size=shape_pair[0]).astype(itype)
         f_ = np.random.uniform(3, 50, size=shape_pair[-1]).astype(ftype)

         out_mx = i_ / f_
         assert out_mx.dtype == ftype
         out_np = onp.true_divide(i_.asnumpy(), f_.asnumpy())
         assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)

         out_mx = f_ / i_
         assert out_mx.dtype == ftype
         out_np = onp.true_divide(f_.asnumpy(), i_.asnumpy())
         assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False)


 @use_np
 def test_np_column_stack():
     class TestColumnStack(HybridBlock):
         def __init__(self):
             super(TestColumnStack, self).__init__()

         def forward(self, a, *args):
             return np.column_stack([a] + list(args))

     def g(data):
         return onp.ones_like(data)

     configs = [
         ((), (), ()),
         ((2), (2), (2)),
         ((0), (0), (0)),
         ((0, 3, 0), (0, 0, 0), (0, 1, 0)),
         ((2, 2), (2, 1), (2, 3)),
         ((4, 3), (4, 0), (4, 1)),
         ((2, 2, 2), (2, 4, 2), (2, 2, 2)),
         ((0, 1, 1), (0, 1, 1), (0, 1, 1))
     ]
     types = ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']
     for config, hybridize, dtype in itertools.product(configs, [True, False], types):
         test_column_stack = TestColumnStack()
         if hybridize:
             test_column_stack.hybridize()
         rtol = 1e-3
         atol = 1e-5
         v = []
         v_np = []
         for i in range(3):
             v_np.append(onp.array(onp.random.uniform(-10.0, 10.0, config[i]), dtype=dtype))
             v.append(mx.nd.array(v_np[i]).as_np_ndarray())
             v[i].attach_grad()
         expected_np = onp.column_stack(v_np)
         with mx.autograd.record():
             mx_out = test_column_stack(*v)
         assert mx_out.shape == expected_np.shape
         assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)

         # Test gradient
         mx_out.backward()
         for i in range(3):
             expected_grad = g(v_np[i])
             assert_almost_equal(v[i].grad.asnumpy(), expected_grad, rtol=rtol, atol=atol)

         # Test imperative once again
         mx_out = np.column_stack(v)
         expected_np = onp.column_stack(v_np)
         assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)


 def test_npx_reshape():
     class TestNumpyXReshape(HybridBlock):
         def __init__(self, newshape, reverse):
             super(TestNumpyXReshape, self).__init__()
             self._newshape = newshape
             self._reverse = reverse

         def forward(self, a, *args, **kwargs):
             return npx.reshape(a, self._newshape, reverse=self._reverse)

     test_cases = [
         [(2, 3, 5, 5),  (-2, -1),         False, (2, 75)],
         [(2, 3, 5, 5),  (-2, -2, -1),     False, (2, 3, 25)],
         [(5, 3, 4, 5),  (-2, -1, -2),     False, (5, 15, 4)],
         [(2, 3, 5, 4),  (-1, -2, -2),     False, (8, 3, 5)],
         [(2, 3, 5, 5),  (-2, -2, -2, -2), False, (2, 3, 5, 5)],
         [(2, 1, 4, 5),  (-2, -3, -2, -2), False, (2, 4, 5)],
         [(1, 1, 4, 1),  (-3, -3, -2, -2), False, (4, 1)],
         [(1, 1, 1, 1),  (-3, -3, -3, -3), False, ()],
         [(2, 4, 5, 3),  (-1, 2, 2, 1),    False, (30, 2, 2, 1)],
         [(2, 3, 5, 6),  (-4,),            False, (2, 3, 5, 6)],
         [(2, 3, 5, 6),  (6, 1, -4),       False, (6, 1, 5, 6)],
         [(2, 3, 5, 6),  (-5, -5),         False, (6, 30)],
         [(2, 3, 5, 6),  (-5, -1),         False, (6, 30)],
         [(64,),         (-6, 16, 4),      False, (16, 4)],
         [(64,),         (-6, 16, -1),     False, (16, 4)],
         [(64, 1, 2, 3), (-6, 16, -1, -4), False, (16, 4, 1, 2, 3)],
         [(8, 5, 4, 6),  (-4, -1, 3, -6),  True,  (8, 5, 4, 2, 3)]
     ]
     for hybridize in [True, False]:
         for shape, newshape, reverse, expected_ret_shape in test_cases:
             for grad_req in ['write', 'add']:
                 # test gluon
                 test_reshape = TestNumpyXReshape(newshape=newshape, reverse=reverse)
                 if hybridize:
                     test_reshape.hybridize()

                 a = mx.np.random.uniform(-1, 1, shape).astype(np.float32)
                 init_a_grad = mx.np.random.uniform(-1, 1, shape).astype(np.float32)
                 a.attach_grad(grad_req=grad_req)
                 if grad_req == 'add':
                     a.grad[:] = init_a_grad
                 with mx.autograd.record():
                     y = test_reshape(a)
                 assert y.shape == expected_ret_shape,\
                     'y.shape={}, expected_ret_shape={}'.format(y.shape, expected_ret_shape)
                 assert_almost_equal(y.asnumpy(), a.asnumpy().reshape(expected_ret_shape), rtol=1e-3, atol=1e-5)

                 # test backward
                 mx.autograd.backward(y)
                 expected_grad = onp.ones(shape)
                 if grad_req == 'add':
                     expected_grad += init_a_grad.asnumpy()
                 assert_almost_equal(a.grad.asnumpy(), expected_grad, rtol=1e-3, atol=1e-5)

                 # test imperative
                 npx_out = npx.reshape(a, newshape, reverse=reverse)
                 expected_out = onp.reshape(a.asnumpy(), expected_ret_shape)
                 assert_almost_equal(npx_out.asnumpy(), expected_out, rtol=1e-3, atol=1e-5)


 @use_np
 def test_np_share_memory():
     ops = [np.shares_memory, np.may_share_memory]
     # reshape not support boolean types
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     for op in ops:
         for dt in dtypes:
             x = np.zeros([13, 21, 23, 22], dtype=dt)
             assert not op(x[0,:,:,:], x[1,:,:,:])
             assert not op(x[2,:,:,:], x[3,:,:,:])
             assert not op(x[2:5,0,0,0], x[3:4,0,0,0])
             assert not op(x[2:5,0,0,0], x[4:7,0,0,0])
             assert op(x[0,0,0,2:5], x[0,0,0,3:4])
             assert op(x[0,6,0,2:5], x[0,6,0,4:7])
             assert not op(x[0,5,0,2:5], x[0,6,0,4:7])

             for adt in dtypes:
                 assert not op(x, np.ones((5, 0), dtype=adt))
                 assert not op(np.ones((5, 0), dtype=adt), x)
                 assert not op(np.ones((5, 0), dtype=dt), np.ones((0, 3, 0), dtype=adt))


 def test_np_median():
     class TestMedian(HybridBlock):
         def __init__(self, axis=None, keepdims=False):
             super(TestMedian, self).__init__()
             self._axis = axis
             self._keepdims = keepdims

         def forward(self, a):
             return np.median(a, axis=self._axis, keepdims=self._keepdims)

     flags = [True, False]
     dtypes = ['float16', 'float32', 'float64']
     qtypes = ['float32', 'float64']
     tensor_shapes = [
         ((2, 3), None),
         ((2, 3, 4, 5), 3),
         ((2, 3, 4), (0, 2)),
         ((2, 3, 4), 1)
     ]

     for hybridize, keepdims, (a_shape, axis), dtype in \
         itertools.product(flags, flags, tensor_shapes, dtypes):
         atol = 3e-4 if dtype == 'float16' else 1e-4
         rtol = 3e-2 if dtype == 'float16' else 1e-2
         test_median = TestMedian(axis=axis, keepdims=keepdims)
         if hybridize:
             test_median.hybridize()
         a = np.random.uniform(-1.0, 1.0, size=a_shape)
         np_out = onp.median(a.asnumpy(), axis=axis, keepdims=keepdims)
         mx_out = test_median(a)

         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out = np.median(a, axis=axis, keepdims=keepdims)
         np_out = onp.median(a.asnumpy(), axis=axis, keepdims=keepdims)

         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)


 @use_np
 def test_np_quantile():
     class TestQuantile(HybridBlock):
         def __init__(self, axis=None, interpolation='linear', keepdims=False):
             super(TestQuantile, self).__init__()
             self._axis = axis
             self._interpolation = interpolation
             self._keepdims = keepdims

         def forward(self, a, q):
             return np.quantile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims)

     class TestQuantileScalar(HybridBlock):
         def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False):
             super(TestQuantileScalar, self).__init__()
             self._q = q
             self._axis = axis
             self._interpolation = interpolation
             self._keepdims = keepdims

         def forward(self, a):
             return np.quantile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims)

     flags = [True, False]
     interpolation_options = ['linear', 'lower', 'higher', 'nearest', 'midpoint']
     dtypes = [np.int32, np.int64, np.float16, np.float32, np.float64]
     qtypes = [np.float32, np.float64]
     tensor_shapes = [
         ((2, 3), (), None),
         ((2, 3, 4, 5), (), 3),
         ((2, 3, 4), (3,), (0, 2)),
         ((2, 3, 4), (3,), 1)
     ]
     for hybridize, keepdims, q_scalar, (a_shape, q_shape, axis), interpolation, dtype in \
         itertools.product(flags, flags, flags, tensor_shapes, interpolation_options, dtypes):
         if dtype == np.float16 and interpolation == 'linear': continue
         atol = 3e-4 if dtype == np.float16 else 1e-4
         rtol = 3e-2 if dtype == np.float16 else 1e-2
         a = np.random.uniform(-10.0, 10.0, size=a_shape).astype(dtype)
         qtype = random.choice(qtypes)
         q = np.random.uniform(0, 1.0, size=q_shape).astype(qtype)
         np_q = q.asnumpy()
         if q_scalar and q_shape == ():
             q = q.item()
             np_q = q
             test_quantile = TestQuantileScalar(q=q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         else:
             test_quantile = TestQuantile(axis=axis, interpolation=interpolation, keepdims=keepdims)
         if hybridize:
             test_quantile.hybridize()
         mx_out = test_quantile(a) if (q_scalar and q_shape == ()) else test_quantile(a, q)
         np_out = onp.quantile(a.asnumpy(), np_q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out = np.quantile(a, q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         np_out = onp.quantile(a.asnumpy(), np_q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)


 @use_np
 def test_np_percentile():
     class TestPercentile(HybridBlock):
         def __init__(self, axis=None, interpolation='linear', keepdims=False):
             super(TestPercentile, self).__init__()
             self._axis = axis
             self._interpolation = interpolation
             self._keepdims = keepdims

         def forward(self, a, q):
             return np.percentile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims)

     class TestPercentileScalar(HybridBlock):
         def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False):
             super(TestPercentileScalar, self).__init__()
             self._q = q
             self._axis = axis
             self._interpolation = interpolation
             self._keepdims = keepdims

         def forward(self, a):
             return np.percentile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims)

     flags = [True, False]
     interpolation_options = ['linear', 'lower', 'higher', 'nearest', 'midpoint']
     dtypes = [np.int32, np.int64, np.float16, np.float32, np.float64]
     qtypes = [np.float32, np.float64]
     tensor_shapes = [
         ((2, 3), (), None),
         ((2, 3, 4, 5), (), 3),
         ((2, 3, 4, 5), (), (0, 1, 2)),
         ((2, 3, 4, 5), (), (-1, -2)),
         ((2, 3, 4), (3,), (0, 2)),
         ((2, 3, 4), (3,), 1)
     ]
     for hybridize, keepdims, q_scalar, (a_shape, q_shape, axis), interpolation, dtype in \
         itertools.product(flags, flags, flags, tensor_shapes, interpolation_options, dtypes):
         if dtype == np.float16 and interpolation == 'linear': continue
         atol = 3e-4 if dtype == np.float16 else 1e-4
         rtol = 3e-2 if dtype == np.float16 else 1e-2
         a = np.random.uniform(-10.0, 10.0, size=a_shape).astype(dtype)
         qtype = random.choice(qtypes)
         q = np.random.uniform(0, 1.0, size=q_shape).astype(qtype)
         np_q = q.asnumpy()
         if q_scalar and q_shape == ():
             q = q.item()
             np_q = q
             test_percentile = TestPercentileScalar(q=q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         else:
             test_percentile = TestPercentile(axis=axis, interpolation=interpolation, keepdims=keepdims)
         if hybridize:
             test_percentile.hybridize()
         mx_out = test_percentile(a) if (q_scalar and q_shape == ()) else test_percentile(a, q)
         np_out = onp.percentile(a.asnumpy(), np_q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out = np.percentile(a, q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         np_out = onp.percentile(a.asnumpy(), np_q, axis=axis, interpolation=interpolation, keepdims=keepdims)
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)


 @use_np
 def test_np_diff():
     def np_diff_backward(ograd, n, axis):
         res = ograd
         for _ in range(n):
             res = onp.negative(onp.diff(res, n=1, axis=axis, prepend=0, append=0))
         return res

     class TestDiff(HybridBlock):
         def __init__(self, n=1, axis=-1):
             super(TestDiff, self).__init__()
             self._n = n
             self._axis = axis

         def forward(self, a):
             return np.diff(a, n=self._n, axis=self._axis)

     shapes = [tuple(random.randrange(10) for i in range(random.randrange(6))) for j in range(5)]
     for hybridize in [True, False]:
         for shape in shapes:
             for axis in [i for i in range(-len(shape), len(shape))]:
                 for n in [i for i in range(0, shape[axis]+1)]:
                     test_np_diff = TestDiff(n=n, axis=axis)
                     if hybridize:
                         test_np_diff.hybridize()
                     for itype in [onp.float16, onp.float32, onp.float64]:
                         # note the tolerance shall be scaled by the input n
                         if itype == onp.float16:
                             rtol = atol = 1e-2*len(shape)*n
                         else:
                             rtol = atol = 1e-5*len(shape)*n
                         x = rand_ndarray(shape).astype(itype).as_np_ndarray()
                         x.attach_grad()
                         np_out = onp.diff(x.asnumpy(), n=n, axis=axis)
                         with mx.autograd.record():
                             mx_out = test_np_diff(x)
                         assert mx_out.shape == np_out.shape
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
                         mx_out.backward()
                         if (np_out.size == 0):
                             np_backward = onp.zeros(shape)
                         else:
                             np_backward = np_diff_backward(onp.ones(np_out.shape, dtype=itype), n=n, axis=axis)
                         assert x.grad.shape == np_backward.shape
                         assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

                         mx_out = np.diff(x, n=n, axis=axis)
                         np_out = onp.diff(x.asnumpy(), n=n, axis=axis)
                         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_ediff1d():
     def np_diff_backward(size, shape):
         if size <= 1:
             return onp.zeros(shape)
         else:
             ret = onp.ones(size - 1)
             return onp.negative(onp.diff(ret, n=1, axis=-1, prepend=0, append=0)).reshape(shape)

     # case 1: when both `to_begin` and `to_end` are arrays
     class TestEDiff1DCASE1(HybridBlock):
         def __init__(self):
             super(TestEDiff1DCASE1, self).__init__()

         def forward(self, a, b, c):
             return np.ediff1d(a, to_end=b, to_begin=c)

     # case 2: only `to_end` is array but `to_begin` is scalar/None
     class TestEDiff1DCASE2(HybridBlock):
         def __init__(self, to_begin=None):
             super(TestEDiff1DCASE2, self).__init__()
             self._to_begin = to_begin

         def forward(self, a, b):
             return np.ediff1d(a, to_end=b, to_begin=self._to_begin)

     # case 3: only `to_begin` is array but `to_end` is scalar/None
     class TestEDiff1DCASE3(HybridBlock):
         def __init__(self, to_end=None):
             super(TestEDiff1DCASE3, self).__init__()
             self._to_end = to_end

         def forward(self, a, b):
             return np.ediff1d(a, to_end=self._to_end, to_begin=b)

     # case 4: both `to_begin` and `to_end` are scalar/None
     class TestEDiff1DCASE4(HybridBlock):
         def __init__(self, to_end=None, to_begin=None):
             super(TestEDiff1DCASE4, self).__init__()
             self._to_begin = to_begin
             self._to_end = to_end

         def forward(self, a):
             return np.ediff1d(a, to_end=self._to_end, to_begin=self._to_begin)

     rtol = 1e-3
     atol = 1e-5
     mapper = {(True, True): TestEDiff1DCASE1,
               (False, True): TestEDiff1DCASE2,
               (True, False): TestEDiff1DCASE3,
               (False, False): TestEDiff1DCASE4}
     hybridize_list = [True, False]
     shape_list = [(), (1,), (2, 3), 6, (7, 8), 10, (4, 0, 5)]
     # dtype_list = [np.int32, np.int64, np.float16, np.float32, np.float64]
     dtype_list = [np.float16, np.float32, np.float64]
     append_list = [1, 2, None, (1, 2, 4), (4, 3), (), (5, 0), (6)]

     for hybridize, dtype, shape, to_begin, to_end in itertools.product(hybridize_list, dtype_list,
                 shape_list, append_list, append_list):
         mx_arr = np.random.randint(5, size=shape).astype(dtype)
         np_arr = mx_arr.asnumpy()
         kwargs = {}
         mx_args = [mx_arr]
         np_args = [np_arr]
         mx_args_imperative = [mx_arr]

         if isinstance(to_end, tuple):
             to_end = np.random.randint(5, size=to_end).astype(dtype)
             mx_args.append(to_end)
             np_args.append(to_end.asnumpy())
         else:
             kwargs["to_end"] = to_end
             np_args.append(to_end)
         mx_args_imperative.append(to_end)

         if isinstance(to_begin, tuple):
             to_begin = np.random.randint(5, size=to_begin).astype(dtype)
             mx_args.append(to_begin)
             np_args.append(to_begin.asnumpy())
         else:
             kwargs["to_begin"] = to_begin
             np_args.append(to_begin)
         mx_args_imperative.append(to_begin)

         from mxnet.numpy import ndarray as np_ndarray
         input_type = (isinstance(to_begin, np_ndarray), isinstance(to_end, np_ndarray))
         test_np_ediff1d = mapper[input_type](**kwargs)

         if hybridize:
             test_np_ediff1d.hybridize()

         np_out = onp.ediff1d(*np_args)
         for arg in mx_args:
             arg.attach_grad()

         with mx.autograd.record():
             mx_out = test_np_ediff1d(*mx_args)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)
         # test imperative
         mx_out_imperative = np.ediff1d(*mx_args_imperative)
         assert mx_out_imperative.shape == np_out.shape
         assert_almost_equal(mx_out_imperative.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out.backward()
         if dtype in [np.float16, np.float32, np.float64]:
             for idx, arg in enumerate(mx_args):
                 if idx == 0:
                     assert_almost_equal(arg.grad.asnumpy(), np_diff_backward(arg.size, arg.shape), atol=atol, rtol=rtol)
                 else:
                     assert_almost_equal(arg.grad.asnumpy(), np.ones_like(arg), atol=atol, rtol=rtol)


 @use_np
 def test_np_column_stack():
     class TestColumnStack(HybridBlock):
         def __init__(self):
             super(TestColumnStack, self).__init__()

         def forward(self, a, *args):
             return np.column_stack([a] + list(args))

     def g(data):
         return onp.ones_like(data)

     configs = [
         ((), (), ()),
         ((2), (2), (2)),
         ((0), (0), (0)),
         ((0, 3, 0), (0, 0, 0), (0, 1, 0)),
         ((2, 2), (2, 1), (2, 3)),
         ((4, 3), (4, 0), (4, 1)),
         ((2, 2, 2), (2, 4, 2), (2, 2, 2)),
         ((0, 1, 1), (0, 1, 1), (0, 1, 1))
     ]
     types = ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']
     for config, hybridize, dtype in itertools.product(configs, [True, False], types):
         test_column_stack = TestColumnStack()
         if hybridize:
             test_column_stack.hybridize()
         rtol = 1e-3
         atol = 1e-5
         v = []
         v_np = []
         for i in range(3):
             v_np.append(onp.array(onp.random.uniform(-10.0, 10.0, config[i]), dtype=dtype))
             v.append(mx.nd.array(v_np[i]).as_np_ndarray())
             v[i].attach_grad()
         expected_np = onp.column_stack(v_np)
         with mx.autograd.record():
             mx_out = test_column_stack(*v)
         assert mx_out.shape == expected_np.shape
         assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)

         # Test gradient
         mx_out.backward()
         for i in range(3):
             expected_grad = g(v_np[i])
             assert_almost_equal(v[i].grad.asnumpy(), expected_grad, rtol=rtol, atol=atol)

         # Test imperative once again
         mx_out = np.column_stack(v)
         expected_np = onp.column_stack(v_np)
         assert_almost_equal(mx_out.asnumpy(), expected_np, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.skip(reason='Test hangs. Tracked in #18144')
 def test_np_resize():
     class TestResize(HybridBlock):
         def __init__(self, new_shape):
             super(TestResize, self).__init__()
             self._new_shape = new_shape

         def forward(self, x, *args, **kwargs):
             return np.resize(x, self._new_shape)

     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool_]
     shape_config = [
         [(), (2, 3)],
         [(2, 3), (2,)],
         [(2, 3), 2],
         [(2, 0, 1), (2, 2)],
         [(2, 0, 1), (3, 4, 5)],
         [((1,)), ()],
     ]
     flags = [True, False]
     for dtype, shape_pair, hybridize in itertools.product(dtypes, shape_config, flags):
         a = np.random.uniform(low=0, high=100, size=shape_pair[0], dtype='float64').astype(dtype)
         test = TestResize(shape_pair[1])
         if hybridize:
             test.hybridize()
         ret = test(a)
         expected_ret = onp.resize(a.asnumpy(), shape_pair[1])
         assert_almost_equal(ret.asnumpy(), expected_ret, atol=1e-5, rtol=1e-5, use_broadcast=False)

         # check imperative again
         ret = np.resize(a, shape_pair[1])
         assert_almost_equal(ret.asnumpy(), expected_ret, atol=1e-5, rtol=1e-5, use_broadcast=False)


 @use_np
 def test_np_diag():
     class TestDiag(HybridBlock):
         def __init__(self, k=0):
             super(TestDiag, self).__init__()
             self._k = k

         def forward(self, a):
             return np.diag(a, k=self._k)

     shapes = [(), (2,), (1, 5), (2, 2), (2, 5), (3, 3), (4, 3)]
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     range_k = 6
     combination = itertools.product([False, True], shapes, dtypes, list(range(-range_k, range_k)))
     for hybridize, shape, dtype, k in combination:
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5
         test_diag = TestDiag(k)
         if hybridize:
             test_diag.hybridize()
         x = np.random.uniform(-2.0, 2.0, size=shape).astype(dtype) if len(shape) != 0 else np.array(())
         x.attach_grad()
         np_out = onp.diag(x.asnumpy(), k)
         with mx.autograd.record():
             mx_out = test_diag(x)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)

         # check backward function
         mx_out.backward()
         if len(shape) == 0:
             np_backward = np.array(())
         elif len(shape) == 1:
             np_backward = np.ones(shape[0])
         else:
             np_backward = np.zeros(shape)
             h = shape[0]
             w = shape[1]
             if k > 0:
                 w -= k
             else:
                 h += k
             s = min(w, h)
             if s > 0:
                 if k >= 0:
                     for i in range(s):
                         np_backward[0+i][k+i] = 1
                 else:
                     for i in range(s):
                         np_backward[-k+i][0+i] = 1
         assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

         # Test imperative once again
         mx_out = np.diag(x, k)
         np_out = onp.diag(x.asnumpy(), k)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.parametrize('config', [
     [(1, 5), (0, 1)], [(2, 2), (0, 1)],
     [(2, 5), (0, 1)], [(5, 5), (0, 1)],
     [(2, 2, 2), (0, 1)], [(2, 4, 4), (0, 2)],
     [(3, 3, 3), (1, 2)], [(4, 8, 8), (1, 2)],
     [(4, 4, 4, 4), (1, 2)], [(5, 6, 7, 8), (2, 3)],
     [(6, 7, 8, 9, 10), (3, 4)]
 ])
 @pytest.mark.parametrize('k', [0, 2, 4, 6])
 @pytest.mark.parametrize('dtype', [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64])
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('call_by_instance', [True, False])
 def test_np_diagonal(config, k, dtype, hybridize, call_by_instance):
     class TestDiagonal(HybridBlock):
         def __init__(self, k=0, axis1=0, axis2=1, call_by_instance=False):
             super(TestDiagonal, self).__init__()
             self._k = k
             self._axis1 = axis1
             self._axis2 = axis2
             self._call_by_instance = call_by_instance

         def forward(self, a):
             if self._call_by_instance:
                 return a.diagonal(self._k, self._axis1, self._axis2)
             else:
                 return np.diagonal(a, self._k, self._axis1, self._axis2)

     rtol = 1e-2 if dtype == np.float16 else 1e-3
     atol = 1e-4 if dtype == np.float16 else 1e-5
     shape, (axis1, axis2) = config
     x = np.random.uniform(-5.0, 5.0, size=shape).astype(dtype)
     x.attach_grad()
     test_diagonal = TestDiagonal(k, axis1, axis2, call_by_instance)
     if hybridize:
         test_diagonal.hybridize()

     if call_by_instance:
         np_out = x.asnumpy().diagonal(offset=k, axis1=axis1, axis2=axis2)
     else:
         np_out = onp.diagonal(x.asnumpy(), offset=k, axis1=axis1, axis2=axis2)
     with mx.autograd.record():
         mx_out = test_diagonal(x)
     assert mx_out.shape == np_out.shape
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)

     # check backward function
     mx_out.backward()
     size_out = np_out.size
     shape_out = np_out.shape
     ndim = len(shape)
     h = shape[axis1]
     w = shape[axis2]
     np_backward_slice = onp.zeros((h, w))
     np_backward = onp.zeros(shape)
     if k > 0:
         w -= k
     else:
         h += k
     s = min(w, h)
     if s > 0:
         if k >= 0:
             for i in range(s):
                 np_backward_slice[0+i][k+i] = 1
         else:
             for i in range(s):
                 np_backward_slice[-k+i][0+i] = 1
         ileading = int(size_out/s)
         array_temp = onp.array([np_backward_slice for i in range(ileading)])
         array_temp = array_temp.reshape(shape_out[:-1] + (shape[axis1], shape[axis2]))
         axis_idx = [i for i in range(ndim-2)]
         axis_idx[axis1:axis1] = [ndim - 2]
         axis_idx[axis2:axis2] = [ndim - 1]
         np_backward = onp.transpose(array_temp, tuple(axis_idx))
     assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=rtol, atol=atol)

     # Test imperative once again
     mx_out = np.diagonal(x, k, axis1, axis2)
     np_out = onp.diagonal(x.asnumpy(), offset=k, axis1=axis1, axis2=axis2)
     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 def test_np_nan_to_num():
     def take_ele_grad(ele):
         if onp.isinf(ele) or onp.isnan(ele):
             return 0
         return 1
     def np_nan_to_num_grad(data):
         shape = data.shape
         arr = list(map(take_ele_grad,data.flatten()))
         return onp.array(arr).reshape(shape)

     class TestNanToNum(HybridBlock):
         def __init__(self, copy=True, nan=0.0, posinf=None, neginf=None):
             super(TestNanToNum, self).__init__()
             self.copy = copy
             self.nan = nan
             self.posinf = posinf
             self.neginf = neginf
             # necessary initializations

         def forward(self, a):
             return np.nan_to_num(a, self.copy, self.nan, self.posinf, self.neginf)

     src_list = [
         onp.nan,
         onp.inf,
         -onp.inf,
         1,
         [onp.nan],
         [onp.inf],
         [-onp.inf],
         [1],
         [1,2,3,4,-1,-2,-3,-4,0],
         [onp.nan, onp.inf, -onp.inf],
         [onp.nan, onp.inf, -onp.inf, -574, 0, 23425, 24234,-5],
         [onp.nan, -1, 0, 1],
         [[-433, 0, 456, onp.inf], [-1, -onp.inf, 0, 1]]
     ]

     dtype_list = ['float16', 'float32', 'float64']
     # [nan, inf, -inf]
     param_list = [[None, None, None], [0, 1000, -100], [0.0, 9999.9, -9999.9]]
     # Inplace operations are not supported when recording in deferred compute mode
     # copy_list = [True, False]
     copy_list = [True]
     hybridize_list = [True, False]
     atol, rtol = 1e-5, 1e-3

     src_dtype_comb = list(itertools.product(src_list,dtype_list))
     # check the dtype = int case in both imperative and sympolic expression
     src_dtype_comb.append((1,'int32'))
     src_dtype_comb.append(([234, 0, -40],'int64'))

     combinations = itertools.product(hybridize_list, src_dtype_comb, copy_list, param_list)

     numpy_version = onp.version.version
     for [hybridize, src_dtype, copy, param] in combinations:
         src, dtype = src_dtype
         # np.nan, np.inf, -np.int are float type
         x1 = mx.nd.array(src, dtype=dtype).as_np_ndarray().asnumpy()
         x2 = mx.nd.array(src, dtype=dtype).as_np_ndarray()
         x3 = mx.nd.array(src, dtype=dtype).as_np_ndarray()

         expected_grad = np_nan_to_num_grad(x1)
         x2.attach_grad()
         # with optional parameters or without
         if param[0] !=None and numpy_version>="1.17":
             test_np_nan_to_num = TestNanToNum(copy=copy, nan=param[0], posinf=param[1], neginf=param[2])
             np_out = onp.nan_to_num(x1, copy=copy, nan=param[0], posinf=param[1], neginf=param[2])
             mx_out = np.nan_to_num(x3, copy=copy, nan=param[0], posinf=param[1], neginf=param[2])
         else:
             test_np_nan_to_num = TestNanToNum(copy=copy)
             np_out = onp.nan_to_num(x1, copy=copy)
             mx_out = np.nan_to_num(x3, copy=copy)

         assert_almost_equal(mx_out.asnumpy(), np_out, rtol, atol)
         # check the inplace operation when copy = False
         # if x1.shape = 0, onp.array will not actually execute copy logic
         # only check x3 from np.nan_to_num instead of x2 from gluon
         if copy == False and x1.shape!=():
             assert x1.shape == x3.asnumpy().shape
             assert x1.dtype == x3.asnumpy().dtype
             assert_almost_equal(x1, x3.asnumpy(), rtol=rtol, atol=atol)
         # gluon does not support nan_to_num when copy=False
         # backward will check int type and if so, throw error
         # if not this case, test gluon
         if not (hybridize== False and copy == False) and ('float' in dtype):
             if hybridize:
                 test_np_nan_to_num.hybridize()
             with mx.autograd.record():
                 mx_out_gluon = test_np_nan_to_num(x2)
             assert_almost_equal(mx_out_gluon.asnumpy(), np_out, rtol, atol)
             mx_out_gluon.backward()
             assert_almost_equal(x2.grad.asnumpy(), expected_grad, rtol=1e-3, atol=1e-5)

         # Test imperative once again
         # if copy = False, the value of x1 and x2 has changed
         if copy == True:
             np_out = onp.nan_to_num(x1)
             mx_out = np.nan_to_num(x3)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)


 @use_np
 def test_np_unary_bool_funcs():
     def check_unary_func(func):
         class TestUnary(HybridBlock):
             def __init__(self, func):
                 super(TestUnary, self).__init__()
                 self._func = func

             def forward(self, a):
                 return getattr(np, self._func)(a)

         src_list = [
             onp.nan,
             onp.inf,
             -onp.inf,
             float('inf'),
             float('-inf'),
             float("nan"),
             onp.array(0)/0,  # nan
             0.0 * onp.inf,  # nan
             onp.inf/onp.inf,  # nan
             onp.inf - onp.inf,  # nan
             onp.array(1)/0,  # inf
             0 + np.inf,  # inf
             1,
             [onp.nan],
             [onp.inf],
             [-onp.inf],
             [onp.array(0)/0],
             [-onp.array(0)/0],
             [onp.inf - onp.inf],  # nan
             [1],
             [1,2,3,4,-1,-2,-3,-4,0],
             [onp.nan, onp.inf, -onp.inf],
             [onp.nan, onp.inf, -onp.inf, -574, 0, 23425, 24234,-5],
             [onp.nan, -1, 0, 1, float('inf'), float('-inf'), float('nan')],
             [[-433, 0, 456, onp.inf], [-1, -onp.inf, 0, 1]]
         ]

         np_func = getattr(onp, func)
         mx_func = TestUnary(func)
         dtype_list = ['float16', 'float32', 'float64']
         hybridize_list = [True, False]
         atol, rtol = 1e-5, 1e-3

         for [hybridize, dtype, src] in itertools.product(hybridize_list, dtype_list, src_list):
             mx_data = mx.np.array(src, dtype=dtype)
             np_data = mx_data.asnumpy()

             if hybridize:
                 mx_func.hybridize()
             with mx.autograd.record():
                 mx_out= mx_func(mx_data)

             assert mx_out.dtype == np.bool_

             np_out = np_func(np_data)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol, atol)
             # test imperative
             mx_out_imperative = getattr(mx.np, func)(mx_data)
             assert_almost_equal(mx_out_imperative.asnumpy(), np_out, rtol, atol)
             # if `out` is given and dtype == np.bool
             mx_x = np.ones_like(mx_data).astype(np.bool)
             np_x = mx_x.asnumpy()
             getattr(mx.np, func)(mx_data, mx_x)
             np_func(np_data, np_x)
             assert_almost_equal(mx_out_imperative .asnumpy(), np_out, rtol, atol)
             # if `out` is given but dtype mismatches
             mx_y = np.ones_like(mx_data)
             assertRaises(TypeError, getattr(np, func), mx_data, out=mx_y)

             assertRaises(NotImplementedError, getattr(np, func), mx_data, where=False)
             assertRaises(NotImplementedError, getattr(np, func), mx_data,  subok=False)
             assertRaises(NotImplementedError, getattr(np, func), mx_data,  dtype=onp.int8)
             assertRaises(TypeError, getattr(np, func), mx_data,  dtype="abcdefg")
             assertRaises(NotImplementedError, getattr(np, func), mx_data,  casting='safe')
             assertRaises(TypeError, getattr(np, func), mx_data,  casting='mxnet')
             assertRaises(NotImplementedError, getattr(np, func), mx_data,  order='C')
             assertRaises(NotImplementedError, getattr(np, func), mx_data,  order='mxnet')

         # test special shape and dtype
         shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
         dtype_list = ['int32', 'int64', 'float16', 'float32', 'float64']
         for [hybridize, dtype, shape] in itertools.product(hybridize_list, dtype_list, shape_list):
             mx_data = mx.np.random.randint(low=-1, high=1, size=shape).astype(dtype)
             np_data = mx_data.asnumpy()

             if hybridize:
                 mx_func.hybridize()
             with mx.autograd.record():
                 mx_out= mx_func(mx_data)

             assert mx_out.dtype == np.bool_

             np_out = np_func(np_data)
             assert_almost_equal(mx_out.asnumpy(), np_out, rtol, atol)
             mx_out_imperative = getattr(mx.np, func)(mx_data)
             assert_almost_equal(mx_out_imperative .asnumpy(), np_out, rtol, atol)

     check_unary_func("isnan")
     check_unary_func("isinf")
     check_unary_func("isposinf")
     check_unary_func("isneginf")
     check_unary_func("isfinite")


 @use_np
 def test_np_polyval():
     class TestPolyval(HybridBlock):
         def __init__(self):
             super(TestPolyval, self).__init__()

         def forward(self, p, x, *args, **kwargs):
             return np.polyval(p, x)

     def polyval_grad(p, x):
         x_shape = x.shape
         x = x.reshape((x.size, 1))
         x = onp.broadcast_to(x, (x.size, p.size))
         exp = onp.arange(p.size-1, -1, -1)
         p_grad = onp.power(x, exp)
         coeff = exp-1
         coeff[-1] = 0
         x_grad = onp.power(x, coeff) * p * exp
         p_grad = onp.sum(p_grad, axis=0)
         x_grad = onp.sum(x_grad, axis=-1).reshape(x_shape)
         return (p_grad, x_grad)

     dtypes = ['float32', 'float64', 'int32', 'int64']
     x_shapes = [
         (5,),
         (10),
         (3, 3),
         (3, 4),
         (3, 3, 3),
         (2, 2, 4, 3),
         (2, 0, 2, 3)
     ]
     flags = [True, False]
     for dtype, x_shape, hybridize in itertools.product(dtypes, x_shapes, flags):
         p_shape = (random.randint(1, 8),)
         test_polyval = TestPolyval()
         if hybridize:
             test_polyval.hybridize()
         rtol = 1e-2
         atol = 1e-4
         if dtype in ['int32', 'int64']:
             p = np.random.randint(-16, 16, p_shape, dtype=dtype)
             x = np.random.randint(-5, 5, x_shape, dtype=dtype)
         else:
             p = np.random.uniform(-1.0, 1.0, size=p_shape, dtype=dtype)
             x = np.random.uniform(-1.0, 1.0, size=x_shape, dtype=dtype)

         p.attach_grad()
         x.attach_grad()
         np_out = onp.polyval(p.asnumpy(), x.asnumpy())
         with mx.autograd.record():
             mx_out = test_polyval(p, x)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out.backward()
         if dtype in ['float16', 'float32', 'float64']:
             p_grad, x_grad = polyval_grad(p.asnumpy(), x.asnumpy())
             assert_almost_equal(p.grad.asnumpy(), p_grad, atol=atol, rtol=rtol)
             assert_almost_equal(x.grad.asnumpy(), x_grad, atol=atol, rtol=rtol)

         mx_out = np.polyval(p, x)
         np_out = onp.polyval(p.asnumpy(), x.asnumpy())
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)


 @use_np
 def test_np_where():
     class TestWhere(HybridBlock):
         def __init__(self):
             super(TestWhere, self).__init__()

         def forward(self, cond, x, y):
             return np.where(cond, x, y)

     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
     shape_configs = [
         [(), (2, 3), (4, 1, 3)],
         [(), (4, 1, 3), (2, 3)],
         [(2, 3), (4, 1, 3), ()],
         [(4, 1, 3), (2, 3), ()],
         [(2, 3), (), (4, 1, 3)],
         [(2, 3), (2, 3), (2, 3)],
         [(2, 3), (2, 1), (2, 3)],
         [(2, 1), (2, 3), (2, 3)],
         [(2, 3), (2, 3), (2, 1)]
     ]
     flags = [True, False]
     for ctype, dtype, shape_pair, hybridize in itertools.product(dtypes, dtypes, shape_configs, flags):
         cond = np.round(np.random.uniform(low=0, high=2, size=shape_pair[0], dtype='float64')).astype(ctype)
         x = np.random.uniform(low=0, high=100, size=shape_pair[1], dtype='float64').astype(dtype)
         y = np.random.uniform(low=0, high=100, size=shape_pair[2], dtype='float64').astype(dtype)
         cond.attach_grad()
         x.attach_grad()
         y.attach_grad()
         test_mod = TestWhere()
         if hybridize:
             test_mod.hybridize()
         with mx.autograd.record():
             ret = test_mod(cond, x, y)

         assert same(ret.asnumpy(), onp.where(cond.asnumpy(), x.asnumpy(), y.asnumpy()))
         if dtype in [np.float16, np.float32, np.float64]:
             ret.backward()
             assert same(cond.grad.asnumpy(), onp.zeros(shape_pair[0], dtype=ctype))

             xgrad = x.grad.asnumpy()
             npgrad = collapse_sum_like((onp.broadcast_to(cond.asnumpy(), ret.shape) != 0).astype(dtype), shape_pair[1])
             npgrad = npgrad.astype(xgrad.dtype)
             assert same(xgrad, npgrad)

         # check imperative again
         ret = np.where(cond, x, y)
         assert same(ret.asnumpy(), onp.where(cond.asnumpy(), x.asnumpy(), y.asnumpy()))

         # check scalar case
         if dtype in [np.float16, np.float32, np.float64]:
             # lscalar
             with mx.autograd.record():
                 ret_lscalar = np.where(cond, 1, x)
             assert same(ret_lscalar.asnumpy(), onp.where(cond.asnumpy(), 1, x.asnumpy()))
             ret_lscalar.backward()

             xgrad = x.grad.asnumpy()
             npgrad = collapse_sum_like((onp.broadcast_to(cond.asnumpy(), ret_lscalar.shape) == 0).astype(dtype), shape_pair[1])
             npgrad = npgrad.astype(xgrad.dtype)
             assert same(xgrad, npgrad)
             # rscalar
             with mx.autograd.record():
                 ret_rscalar = np.where(cond, x, 1)
             assert same(ret_rscalar.asnumpy(), onp.where(cond.asnumpy(), x.asnumpy(), 1))
             ret_rscalar.backward()

             xgrad = x.grad.asnumpy()
             npgrad = collapse_sum_like((onp.broadcast_to(cond.asnumpy(), ret_rscalar.shape) != 0).astype(dtype), shape_pair[1])
             npgrad = npgrad.astype(xgrad.dtype)
             assert same(xgrad, npgrad)

         # check both scalar case
         x = onp.random.randint(0, 100)
         y = onp.random.randint(0, 100)
         mx_out = np.where(cond, x, y)
         np_out = onp.where(cond, x, y)
         assert same(mx_out, np_out)


 @use_np
 def test_np_expand_dims():
     class TestExpandDims(HybridBlock):
         def __init__(self, axis):
             super(TestExpandDims, self).__init__()
             self._axis = axis

         def forward(self, x):
             return np.expand_dims(x, self._axis)

     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
     shapes = [
         (),
         (0,),
         (0, 1),
         (3,),
         (1, 2, 3),
     ]
     flags = [True, False]
     for dtype, shape, hybridize in itertools.product(dtypes, shapes, flags):
         ndim = len(shape)
         for axis in range(-ndim-1, ndim+1):
             x_np = onp.random.uniform(0, 100, size=shape).astype(dtype)
             expected = onp.expand_dims(x_np, axis)
             for req in ['write', 'add']:
                 test_expand_dims = TestExpandDims(axis)
                 if hybridize:
                     test_expand_dims.hybridize()

                 x = np.array(x_np)
                 x.attach_grad(req)
                 initial_grad = np.random.uniform(0, 10, size=x.shape).astype(x.dtype)
                 x.grad[()] = initial_grad
                 with mx.autograd.record():
                     y = test_expand_dims(x)
                 y.backward()

                 assert_almost_equal(y.asnumpy(), expected, use_broadcast=False)
                 if req == 'null':
                     assert same(x.grad.asnumpy(), initial_grad.asnumpy())
                 elif req == 'write':
                     assert same(x.grad.asnumpy(), onp.ones_like(x.asnumpy()))
                 else:
                     assert_almost_equal(x.grad.asnumpy(), initial_grad.asnumpy() + onp.ones_like(initial_grad.asnumpy()),
                                         atol=1e-2 if dtype is np.float16 else 1e-4,
                                         rtol=1e-2 if dtype is np.float16 else 1e-4,
                                         use_broadcast=False)

                 # check imperative again
                 y = np.expand_dims(x, axis)
                 assert_almost_equal(y.asnumpy(), expected, use_broadcast=False)


 @use_np
 @pytest.mark.parametrize('ishape', [
     2, 5,
     (), (1,), (4,),
     (2, 2), (2, 4), (3, 5),
     (2, 2, 2), (2, 3, 2), (2, 3, 4),
 ])
 @pytest.mark.parametrize('rshape', [
     10, (15,),
     (3, 4), (4, 5),
     (2,3,4)
 ])
 @pytest.mark.parametrize('dtype', [np.uint8, np.int8, np.int32, np.int64])
 @pytest.mark.parametrize('hybridize', [True, False])
 def test_np_unravel_index(ishape, rshape, dtype, hybridize):
     class TestUnravel_index(HybridBlock):
         def __init__(self, shape, order='C') :
             super(TestUnravel_index, self).__init__()
             self._shape = shape
             self._order = order

         def forward(self, a):
             return np.unravel_index(a, self._shape, self._order)


     rtol = 1e-2 if dtype == np.float16 else 1e-3
     atol = 1e-4 if dtype == np.float16 else 1e-5
     test_unravel_index = TestUnravel_index(rshape)
     if hybridize:
         test_unravel_index.hybridize()
     if type(ishape) == int and hybridize:
         x = np.array([ishape], dtype=dtype)
         np_out = onp.unravel_index(x.asnumpy(), rshape)
     else:
         x = np.random.uniform(0, 8, size=ishape).astype(dtype)
         np_out = onp.unravel_index(x.asnumpy(), rshape)
     mx_out = test_unravel_index(x)
     assert len(mx_out) == len(np_out)
     for elem_mx, elem_np in zip(mx_out, np_out):
         assert elem_mx.asnumpy().shape == elem_np.shape
         assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol)
     # no backward function for unravel_index operator

     # Test imperative once again
     mx_out = np.unravel_index(x, rshape)
     np_out = onp.unravel_index(x.asnumpy(), rshape)
     print(np_out)
     assert len(mx_out) == len(np_out)
     for elem_mx, elem_np in zip(mx_out, np_out):
         assert elem_mx.asnumpy().shape == elem_np.shape
         assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol)


 @use_np
 def test_np_diag_indices_from():
     class TestDiag_indices_from(HybridBlock):
         def __init__(self) :
             super(TestDiag_indices_from, self).__init__()

         def forward(self, a):
             return np.diag_indices_from(a)

     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     shapes = [(2, 2), (4, 4), (5, 5, 5), (6, 6, 6, 6), (8, 8, 8, 8)]
     combinations = itertools.product([False, True], dtypes, shapes)
     for hybridize, dtype, shape in combinations:
         rtol = 1e-2 if dtype == np.float16 else 1e-3
         atol = 1e-4 if dtype == np.float16 else 1e-5
         test_diag_indices_from = TestDiag_indices_from()
         if hybridize:
             test_diag_indices_from.hybridize()
         x = np.random.uniform(-8, 8, size=shape).astype(dtype)
         mx_out = test_diag_indices_from(x)
         np_out = onp.diag_indices_from(x.asnumpy())
         assert len(mx_out) == len(np_out)
         for elem_mx, elem_np in zip(mx_out, np_out):
             assert elem_mx.asnumpy().shape == elem_np.shape
             assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol)
         # no backward function for diag_indices_from operator

         # Test imperative once again
         mx_out = np.diag_indices_from(x)
         np_out = onp.diag_indices_from(x.asnumpy())
         assert len(mx_out) == len(np_out)
         for elem_mx, elem_np in zip(mx_out, np_out):
             assert elem_mx.asnumpy().shape == elem_np.shape
             assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol)


 @use_np
 def test_np_interp():
     class TestInterp(HybridBlock):
         def __init__(self, left=None, right=None, period=None):
             super(TestInterp, self).__init__()
             self._left = left
             self._right = right
             self._period = period

         def forward(self, x, xp, fp):
             return np.interp(x, xp, fp, left=self._left, right=self._right, period=self._period)

     class TestInterpScalar(HybridBlock):
         def __init__(self, x=None, left=None, right=None, period=None):
             super(TestInterpScalar, self).__init__()
             self._x = x
             self._left = left
             self._right = right
             self._period = period

         def forward(self, xp, fp):
             return np.interp(self._x, xp, fp, left=self._left, right=self._right, period=self._period)

     xtypes = [np.int64, np.float32, np.float64]
     dtypes = [np.int32, np.int64, np.float32, np.float64]
     xshapes = [
         (), (3,), (5,), (20,),
         (2, 2), (4, 4), (8, 8),
         (5, 5, 5), (8, 0, 8)
     ]
     dsizes = [10, 30]
     periods = [None, 2*np.pi]
     lefts = [None, -10, 0]
     rights= [None, 20, 50]
     flags = [True, False]
     combinations = itertools.product(flags, flags, xshapes, dsizes, xtypes, dtypes, lefts, rights, periods)
     for hybridize, x_scalar, xshape, dsize, xtype, dtype, left, right, period in combinations:
         rtol = 1e-3
         atol = 1e-5
         if period is not None:
             x = np.random.uniform(-np.pi, np.pi, size=xshape).astype(xtype)
             xp = np.random.uniform(0, 2*np.pi, size=dsize)
             fp = np.sin(xp)
         else:
             x = np.random.uniform(0, 100, size=xshape).astype(xtype)
             xp = np.sort(np.random.choice(100, dsize, replace=False).astype(dtype))
             fp = np.random.uniform(-50, 50, size=dsize).astype(dtype)
         np_x = x.asnumpy()
         if x_scalar and xshape == ():
             x = x.item()
             np_x = x
             test_interp = TestInterpScalar(x=x, left=left, right=right, period=period)
         else:
             test_interp = TestInterp(left=left, right=right, period=period)
         if hybridize:
             test_interp.hybridize()
         mx_out = test_interp(xp, fp) if (x_scalar and xshape == ()) else test_interp(x, xp, fp)
         np_out = onp.interp(np_x, xp.asnumpy(), fp.asnumpy(), left=left, right=right, period=period)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)

         mx_out = np.interp(x, xp, fp, left=left, right=right, period=period)
         np_out = onp.interp(np_x ,xp.asnumpy(), fp.asnumpy(), left=left, right=right, period=period)
         assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol)


 @use_np
 def test_np_bincount():
     class TestBincount(HybridBlock):
         def __init__(self, minlength=0):
             super(TestBincount, self).__init__()
             self._minlength = minlength

         def forward(self, a):
             return np.bincount(a, None, self._minlength)

     class TestBincountWeights(HybridBlock):
         def __init__(self, minlength=0):
             super(TestBincountWeights, self).__init__()
             self._minlength = minlength

         def forward(self, a, weights):
             return np.bincount(a, weights, self._minlength)

     dtypes = [np.int8, np.uint8, np.int32, np.int64]
     weight_types = [np.int32, np.int64, np.float16, np.float32, np.float64]
     shapes = [(), (5,), (10,), (15,), (20,), (30,), (50,)]
     min_lengths = [0, 5, 20, 50]
     has_weights = [True, False]
     combinations = itertools.product([True, False], shapes, dtypes, weight_types, has_weights, min_lengths)
     for hybridize, shape, dtype, weight_type, has_weight, minlength in combinations:
         rtol = 1e-2 if weight_type == np.float16 else 1e-3
         atol = 1e-4 if weight_type == np.float16 else 1e-5
         if shape != ():
             data = np.random.uniform(0, 10, size=shape).astype(dtype)
             weights = np.random.uniform(0, 10, size=shape).astype(weight_type) if has_weight else None
         else:
             data = np.array(()).astype(dtype)
             weights = np.array(()).astype(weight_type) if has_weight else None
         weights_np = weights.asnumpy() if has_weight else None
         test_bincount = TestBincountWeights(minlength) if has_weight else TestBincount(minlength)
         if hybridize:
             test_bincount.hybridize()
         mx_out = test_bincount(data, weights) if has_weight else test_bincount(data)
         np_out = onp.bincount(data.asnumpy(), weights_np, minlength)
         assert mx_out.shape == np_out.shape
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
         # No backward operation for operator bincount at this moment

         # Test imperative once again
         mx_out = np.bincount(data, weights, minlength)
         np_out = onp.bincount(data.asnumpy(), weights_np, minlength)
         assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)


 @use_np
 @pytest.mark.skip(reason='Test hangs. Tracked in #18144')
 def test_np_empty_like():
     class TestEmptyLike(HybridBlock):
         def __init__(self, dtype, order, subok):
             super(TestEmptyLike, self).__init__()
             self._dtype = dtype
             self._order = order
             self._subok = subok

         def forward(self, x, *args, **kwargs):
             return np.empty_like(x, self._dtype, self._order, self._subok)

     if StrictVersion(platform.python_version()) < StrictVersion('3.0.0'):
         return

     dtypes = [None, 'float16', 'float32', np.int8, np.uint8, np.int32, np.int64,
               np.float16, np.float32, np.float64, np.bool_]
     shapes = [
         (),
         (1,),
         (5,),
         (4, 3),
         (3, 5),
         (4, 4),
         (4, 5),
         (5, 5),
         (5, 6),
         (6, 6),
         (0, 1),
         (6, 5, 6),
         (2, 3, 3, 4),
         (4, 2, 1, 2),
         (0, 5, 3, 3),
         (5, 0, 3, 3),
         (3, 3, 0, 0),
     ]
     orders = ["C"]
     subok_list = [False]
     flags = [False]
     _np_version = onp.version.version
     for dtype, shape, hybridize, order, subok in itertools.product(dtypes, shapes, flags, orders, subok_list):
         prototype = np.random.uniform(low=0, high=100, size=shape, dtype='float64').astype(dtype)
         test = TestEmptyLike(dtype, order, subok)
         if StrictVersion(_np_version) >= StrictVersion('1.6.0'):
             expected_ret = onp.empty_like(prototype, dtype=dtype, order=order, subok=subok)
         else:
             expected_ret = onp.empty_like(prototype)
         if hybridize:
             test.hybridize()
         ret = test(prototype)
         assert ret.asnumpy().shape == expected_ret.shape

         # check imperative again
         ret = np.empty_like(prototype, dtype, order, subok)
         assert ret.asnumpy().shape == expected_ret.shape


 @use_np
 @pytest.mark.parametrize('hybridize', [True, False])
 @pytest.mark.parametrize('dtype', [np.float32, np.float64])
 @pytest.mark.parametrize('a_shape,b_shape,axes', [
     # - 2 x 2
     ((2,), (2,), (-1, -1, -1)),
     ((1, 2), (1, 2), (-1, -1, -1)),
     ((1, 2), (2, 2), (-1, -1, -1)),
     ((2, 2), (1, 2), (-1, -1, -1)),
     ((2, 2), (2, 2), (-1, -1, -1)),
     ((1, 2), (2, 2), (-1, 0, -1)),
     ((2, 2), (1, 2), (0, -1, -1)),
     ((2, 2), (2, 2), (0, 0, -1)),
     ((2, 2), (2, 2), (0, 0, 0)),
     ((5, 4, 3, 2), (5, 4, 3, 2), (-1, -1, -1)),
     ((1, 4, 3, 2), (5, 1, 3, 2), (-1, -1, -1)),
     ((5, 4, 3, 2), (5, 4, 3, 2), (-1, -1, 0)),
     ((2, 5, 4, 3), (5, 2, 4, 3), (0, 1, 2)),
     ((2, 5, 1, 3), (1, 2, 4, 3), (0, 1, 2)),
     # - 2 x 3
     ((2,), (3,), (-1, -1, -1)),
     ((1, 2,), (1, 3,), (-1, -1, -1)),
     ((2, 2,), (2, 3,), (0, -1, 0)),
     ((1, 2,), (2, 3,), (-1, -1, -1)),
     ((2, 2,), (1, 3,), (-1, -1, -1)),
     ((2, 1,), (3, 4,), (0, 0, 0)),
     ((2, 1, 3), (4, 3, 1), (0, 1, 2)),
     ((6, 5, 4, 2), (6, 5, 4, 3), (-1, -1, -1)),
     ((2, 6, 5, 4), (6, 5, 4, 3), (0, -1, 2)),
     ((2, 6, 5, 4), (6, 3, 5, 4), (0, 1, 2)),
     ((6, 2, 5, 4), (6, 5, 3, 4), (1, 2, 0)),
     ((6, 2, 1, 4), (1, 5, 3, 4), (1, 2, 0)),
     # - 3 x 2
     ((3,), (2,), (-1, -1, -1)),
     ((1, 3,), (1, 2,), (-1, -1, -1)),
     ((2, 3,), (2, 2,), (-1, 0, 0)),
     ((2, 3,), (1, 2,), (-1, -1, -1)),
     ((2, 3,), (1, 2,), (-1, -1, -1)),
     ((3, 4, 4), (1, 1, 2,), (0, -1, 0)),
     ((3, 4, 4), (1, 2, 1,), (0, 1, 2)),
     ((6, 5, 4, 3), (6, 5, 4, 2), (-1, -1, -1)),
     ((3, 6, 5, 4), (6, 5, 4, 2), (0, -1, 2)),
     ((3, 6, 5, 4), (6, 2, 5, 4), (0, 1, 2)),
     ((6, 3, 5, 4), (6, 5, 2, 4), (1, 2, 0)),
     ((6, 3, 1, 4), (1, 5, 2, 4), (1, 2, 0)),
     # - 3 x 3
     ((3,), (3,), (-1, -1, -1)),
     ((1, 3,), (1, 3,), (-1, -1, -1)),
     ((2, 3,), (3, 2,), (-1, 0, 0)),
     ((1, 3,), (3, 2,), (-1, 0, 0)),
     ((1, 3,), (3, 4,), (-1, 0, 0)),
     ((1, 1, 3,), (3, 2, 2), (-1, 0, 0)),
     ((1, 1, 2, 3,), (3, 2, 2, 2), (-1, 0, 0)),
     ((6, 5, 4, 3), (6, 5, 4, 3), (-1, -1, -1)),
     ((3, 6, 5, 4), (6, 5, 4, 3), (0, -1, 2)),
     ((3, 6, 5, 4), (6, 3, 5, 4), (0, 1, 2)),
     ((6, 3, 5, 4), (6, 5, 3, 4), (1, 2, 0)),
     ((6, 3, 1, 4), (1, 5, 3, 4), (1, 2, -1)),

     # - (a_shape, b_shape, None)
     ((2,), (2,), None),
     ((2,), (3,), None),
     ((3,), (2,), None),
     ((3,), (3,), None),
     ((5, 4, 3, 2), (5, 4, 3, 2), None),
     ((6, 5, 4, 2), (6, 5, 4, 3), None),
     ((6, 5, 4, 3), (6, 5, 4, 2), None),
     ((6, 5, 4, 3), (6, 5, 4, 3), None),
     ((1, 4, 3, 2), (5, 1, 3, 2), None),
     ((6, 1, 4, 2), (6, 5, 1, 3), None),
     ((6, 5, 1, 3), (1, 5, 4, 2), None),
     ((1, 5, 4, 3), (6, 5, 1, 3), None),

     # - (a_shape, b_shape, (a_axis, b_axis, c_axis, axis))
     ((2, 5, 4, 3), (2, 5, 4, 3), (-1, -1, -1, 0,)),
     ((6, 2, 5, 4), (6, 3, 5, 4), (-1, -1, -1, 1,)),
     ((6, 5, 3, 4), (6, 5, 2, 4), (-1, -1, -1, 2,)),
     ((6, 5, 4, 3), (6, 5, 4, 3), (-1, -1, -1, 3,)),
 ])
 def test_np_cross(a_shape, b_shape, axes, dtype, hybridize):
     class TestNumpyCross(HybridBlock):
         def __init__(self, axisa=-1, axisb=-1, axisc=-1, axis=None):
             super(TestNumpyCross, self).__init__()
             self._axisa = axisa
             self._axisb = axisb
             self._axisc = axisc
             self._axis = axis

         def forward(self, a, b):
             return np.cross(a, b, self._axisa, self._axisb, self._axisc, self._axis)

     def check_np_cross(x, a_np, b_np, axises):
         try:
             if axises is None:
                 x_expected = onp.cross(a_np, b_np)
             elif len(axises) == 4:
                 (a_axis, b_axis, c_axis, axis,) = axises
                 x_expected = onp.cross(a_np, b_np, axisa=a_axis, axisb=b_axis, axisc=c_axis, axis=axis)
             else:
                 (a_axis, b_axis, c_axis,) = axises
                 x_expected = onp.cross(a_np, b_np, axisa=a_axis, axisb=b_axis, axisc=c_axis)
         except Exception as e:
             print("a:", a_np)
             print("a shape:", a_np.shape)
             print("b:", b_np)
             print("b shape:", b_np.shape)
             print(e)
         else:
             assert x.shape == x_expected.shape
             assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)

     def check_not_use_broadcast(a_np, b_np, axises):
         a_shape = a_np.shape
         b_shape = b_np.shape
         if axises is None:
             return a_shape[:-1] == b_shape[:-1]
         elif len(axises) == 4:
             axis = axises[3]
             a_moveaxis_shape = onp.moveaxis(a_np, axis, -1).shape
             b_moveaxis_shape = onp.moveaxis(b_np, axis, -1).shape
             return a_moveaxis_shape[:-1] == b_moveaxis_shape[:-1]
         else:
             a_axis = axises[0]
             b_axis = axises[1]
             a_moveaxis_shape = onp.moveaxis(a_np, a_axis, -1).shape
             b_moveaxis_shape = onp.moveaxis(b_np, b_axis, -1).shape
             return a_moveaxis_shape[:-1] == b_moveaxis_shape[:-1]

     # calculate dL = gradC * dC
     def cal_dL(grad_c_move, dc_move):
         num = int(onp.prod(dc_move.shape))
         grad_c_move_1d = grad_c_move.reshape((num,))
         dc_move_1d = dc_move.reshape((num,))
         dL = onp.inner(grad_c_move_1d, dc_move_1d)
         return dL

     # get reduced axis index
     def get_reduce_axis(shape, broad_shape):
         axis = list()
         length = len(broad_shape) if len(shape) == len(broad_shape) + 1 else len(broad_shape) - 1
         for i in range(length):
             if shape[i] != broad_shape[i]:
                 axis.append(i)
         return tuple(axis) if len(axis) > 0 else None

     # get grad_a and grad_b
     def get_cross_backward(a, b, axises):
         if axises == None:
             a_axis, b_axis, c_axis = (-1,) * 3
         elif len(axises) == 4:
             a_axis, b_axis, c_axis = (axises[-1],) * 3
         else:
             (a_axis, b_axis, c_axis) = axises
         c = onp.cross(a, b, axisa=a_axis, axisb=b_axis, axisc=c_axis)
         c_move = onp.moveaxis(c, c_axis, -1) if a.shape[a_axis] == 3 or b.shape[b_axis] == 3 else c
         grad_c_move = onp.ones(shape=c_move.shape, dtype=c_move.dtype)
         a_move = onp.moveaxis(a, a_axis, -1)
         b_move = onp.moveaxis(b, b_axis, -1)
         da_move = onp.random.uniform(-1., 1., size=a_move.shape)
         db_move = onp.random.uniform(-1., 1., size=b_move.shape)
         # dC = dA x B + A x dB
         dc_move = onp.cross(da_move, b_move) + onp.cross(a_move, db_move)
         # dL1 = Tr(grad_C.T * dC) = dL/dCi * dCi
         dL1 = cal_dL(grad_c_move, dc_move)
         # check cross backward.
         if a.shape[a_axis] == 2 and b.shape[b_axis] == 2:
             # Case 1: a.shape[-1] == 2 and b.shape[-1] == 2, param.axisc is ignored.
             shape = grad_c_move.shape if grad_c_move.ndim != 0 else (1,)
             grad_a_move = onp.empty(shape, dtype=a_move.dtype)
             grad_b_move = onp.empty(shape, dtype=b_move.dtype)
             grad_a_move = onp.expand_dims(grad_a_move, -1).repeat(2, axis=-1)
             grad_b_move = onp.expand_dims(grad_b_move, -1).repeat(2, axis=-1)
             a_move_0 = a_move[..., 0]
             a_move_1 = a_move[..., 1]
             b_move_0 = b_move[..., 0]
             b_move_1 = b_move[..., 1]
             grad_a_move_0 = grad_c_move * b_move_1
             grad_a_move_1 = grad_c_move * b_move_0
             if grad_a_move_1.ndim == 0:
                 grad_a_move_1 = -grad_a_move_1
             else:
                 onp.negative(grad_a_move_1, out=grad_a_move_1)
             grad_b_move_0 = grad_c_move * a_move_1
             grad_b_move_1 = grad_c_move * a_move_0
             if grad_b_move_0.ndim == 0:
                 grad_b_move_0 = -grad_b_move_0
             else:
                 onp.negative(grad_b_move_0, out=grad_b_move_0)
             grad_a_move[..., 0] = grad_a_move_0
             grad_a_move[..., 1] = grad_a_move_1
             grad_b_move[..., 0] = grad_b_move_0
             grad_b_move[..., 1] = grad_b_move_1
         else:
             # Case 4: a.shape[-1] == 3 and b.shape[-1] == 3, param.axisc is not ignored.
             grad_a_move = onp.cross(b_move, grad_c_move)
             grad_b_move = onp.cross(grad_c_move, a_move)
             if a.shape[a_axis] == 2:
                 # Case 2: a.shape[-1] == 2 and b.shape[-1] == 3, param.axisc is not ignored.
                 grad_a_move = onp.delete(grad_a_move, obj=-1, axis=-1)
             if b.shape[b_axis] == 2:
                 # Case 3: a.shape[-1] == 3 and b.shape[-1] == 2, param.axisc is not ignored.
                 grad_b_move = onp.delete(grad_b_move, obj=-1, axis=-1)

         if not check_not_use_broadcast(a, b, axises):
             a_broad_axis = get_reduce_axis(a_move.shape, c_move.shape)
             b_broad_axis = get_reduce_axis(b_move.shape, c_move.shape)
             if a_broad_axis is not None:
                 grad_a_move_reduce = onp.ones_like(a_move)
                 grad_a_move_reduce = onp.sum(grad_a_move, axis=a_broad_axis, out=grad_a_move_reduce, keepdims=True)
                 grad_a_move = grad_a_move_reduce
             if b_broad_axis is not None:
                 grad_b_move_reduce = onp.ones_like(b_move)
                 grad_b_move_reduce = onp.sum(grad_b_move, axis=b_broad_axis, out=grad_b_move_reduce, keepdims=True)
                 grad_b_move = grad_b_move_reduce
         # dL2 = dL/dAi * dAi + dL/dBi * dBi
         dL2 = cal_dL(grad_a_move, da_move) + cal_dL(grad_b_move, db_move)
         assert_almost_equal(dL1, dL2, rtol=rtol, atol=atol)
         # move working axis
         return onp.moveaxis(grad_a_move, -1, a_axis), onp.moveaxis(grad_b_move, -1, b_axis)

     rtol = 1e-3
     atol = 1e-5
     if axes is None:
         a_axis, b_axis, c_axis = (-1,) * 3
         test_numpy_cross = TestNumpyCross()
     elif len(axes) == 4:
         (a_axis, b_axis, c_axis, axis,) = axes
         test_numpy_cross = TestNumpyCross(axisa=a_axis, axisb=b_axis, axisc=c_axis, axis=axis)
     else:
         (a_axis, b_axis, c_axis,) = axes
         test_numpy_cross = TestNumpyCross(axisa=a_axis, axisb=b_axis, axisc=c_axis)
     if hybridize:
         test_numpy_cross.hybridize()
     a_np = onp.random.uniform(-10., 10., size=a_shape)
     b_np = onp.random.uniform(-10., 10., size=b_shape)
     a = np.array(a_np, dtype=dtype)
     b = np.array(b_np, dtype=dtype)
     a.attach_grad()
     b.attach_grad()

     # check cross validity
     with mx.autograd.record():
         mx_out = test_numpy_cross(a, b)
     check_np_cross(mx_out, a.asnumpy(), b.asnumpy(), axes)

     # check cross backward
     mx.autograd.backward(mx_out)
     grad_a_expected, grad_b_expected = get_cross_backward(a.asnumpy(), b.asnumpy(), axes)
     assert_almost_equal(a.grad.asnumpy(), grad_a_expected, rtol=rtol, atol=atol)
     assert_almost_equal(b.grad.asnumpy(), grad_b_expected, rtol=rtol, atol=atol)

     # check imperative once again
     mx_out = test_numpy_cross(a, b)
     check_np_cross(mx_out, a.asnumpy(), b.asnumpy(), axes)


 @use_np
 def test_np_rollaxis():
     class TestRollaxis(HybridBlock):
         def __init__(self, axis=0, start=0):
             super(TestRollaxis, self).__init__()
             self._axis = axis
             self._start = start

         def forward(self, a, *args, **kwargs):
             return np.rollaxis(a, axis=self._axis, start=self._start)

     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     for hybridize in [False, True]:
         for dtype in dtypes:
             for ndim in [0, 1, 2, 3, 4, 5, 6, 7, 8]:
                 shape = rand_shape_nd(ndim, dim=5, allow_zero_size=True)
                 np_data = onp.random.uniform(low=-100, high=100, size=shape).astype(dtype)
                 mx_data = np.array(np_data, dtype=dtype)
                 for axis in range(-ndim, ndim):
                     for start in range(-ndim, ndim + 1):
                         # test gluon
                         test_rollaxis = TestRollaxis(axis, start)
                         if hybridize:
                             test_rollaxis.hybridize()
                         np_out = onp.rollaxis(np_data, axis=axis, start=start)
                         mx_data.attach_grad()
                         with mx.autograd.record():
                             mx_out = test_rollaxis(mx_data)
                         assert mx_out.shape == np_out.shape
                         mx_out.backward()
                         assert same(mx_data.grad.shape, mx_data.shape)
                         assert same(mx_data.grad.asnumpy(), onp.ones(shape))
                         # test imperative
                         np_out = onp.rollaxis(np_data, axis=axis, start=start)
                         mx_out = np.rollaxis(mx_data, axis=axis, start=start)
                         assert np_out.dtype == mx_out.dtype
                         assert same(mx_out.asnumpy(), np_out)


 @use_np
 def test_npx_stop_gradient():
     class TestStopGradient(HybridBlock):
         def forward(self, a):
             return npx.stop_gradient(a)
     dtypes = ['float16', 'float32', 'float64']
     for hybridize in [False, True]:
         for dtype in dtypes:
             for grad_req in ['write', 'add']:
                 dat = np.ones((10,), dtype=dtype)
                 dat.attach_grad(grad_req)
                 dat.grad[:] = 2
                 old_grad = dat.grad.asnumpy()
                 net = TestStopGradient()
                 if hybridize:
                     net.hybridize()
                 with mx.autograd.record():
                     out = net(dat)
                     out = out + dat
                     out.backward()
                 new_grad = dat.grad.asnumpy()
                 assert same(out.asnumpy(), dat.asnumpy() * 2)
                 if grad_req == 'write':
                     assert_almost_equal(new_grad, onp.ones_like(dat, dtype=dtype))
                 elif grad_req == 'add':
                     assert_almost_equal(new_grad, old_grad + 1)


 def test_npx_broadcast_like_different_types():
     x = mx.np.zeros((2, 1))
     y = mx.np.ones((2, 2))

     y = mx.np.array(y).astype('int32')
     z = mx.npx.broadcast_like(x, y)
     assert_almost_equal(z.asnumpy(), np.array([[0,0],[0,0]]))
     assert x.dtype == z.dtype


 @use_np
 def test_np_elementwise_ops_on_misaligned_input():
     a = np.array([1,2,3,4], dtype='float16')
     b = np.array([1,2,3,4], dtype='float16')

     c = a[1:3]
     d = b[1:3]
     # Note: testing just elemwise_add since all elemwise_ops
     #       share the implementation
     c[:] = c + d
     mx.nd.waitall()

     a = np.array([1,2,3,4], dtype='float16')
     b = np.array([1,2,3,4], dtype='float16')

     c = a[0:3]
     d = b[0:3]
     c[:] = c + d
     mx.nd.waitall()
     assert a[3] == 4.0


 @use_np
 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64'])
 @pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10])
 @pytest.mark.parametrize('both_ways', [False, True])
 def test_np_broadcast_ops_on_misaligned_input(dtype, lead_dim, both_ways):
     shape = list(rand_shape_2d()) + [lead_dim]
     small_shape = [shape[0], 1, lead_dim]
     if both_ways:
         # Broadcast in both ways [1, K, L] x [M, 1, L]
         big_shape = [1, shape[1], lead_dim]
     else:
         big_shape = shape
     size = onp.product(shape)
     small_size = onp.product(small_shape)
     big_size = onp.product(big_shape)
     a = np.arange(5000)
     b = np.arange(5000)
     e = np.arange(5000)
     c = a[1:big_size + 1].reshape(tuple(big_shape))
     d = b[1:small_size + 1].reshape(tuple(small_shape))
     f = e[1:size + 1].reshape(tuple(shape))
     f[:] = c + d
     expected = c.asnumpy() + d.asnumpy()
     mx.nd.waitall()
     assert_almost_equal(f, expected)


 @use_np
 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64'])
 @pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10])
 @pytest.mark.parametrize('both_ways', [False, True])
 def test_np_broadcast_ops_on_misaligned_input_oneside(dtype, lead_dim, both_ways):
     shape = list(rand_shape_2d()) + [lead_dim]
     small_shape = [shape[0], shape[1], 1]
     if both_ways:
         # Broadcast in both ways [1, K, L] x [M, 1, 1]
         big_shape = [1, shape[1], lead_dim]
     else:
         big_shape = shape
     size = onp.product(shape)
     small_size = onp.product(small_shape)
     big_size = onp.product(big_shape)
     a = np.arange(5000)
     b = np.arange(5000)
     e = np.arange(5000)
     c = a[1:big_size + 1].reshape(tuple(big_shape))
     d = b[1:small_size + 1].reshape(tuple(small_shape))
     f = e[1:size + 1].reshape(tuple(shape))
     f[:] = c + d
     expected = c.asnumpy() + d.asnumpy()
     mx.nd.waitall()
     assert_almost_equal(f, expected)

 @use_np
 @pytest.mark.parametrize('num_batch', [1, 2])
 @pytest.mark.parametrize('num_channel_data', [4, 8])
 @pytest.mark.parametrize('num_deformable_group', [1, 2])
 @pytest.mark.parametrize('input_height', [5, 6])
 @pytest.mark.parametrize('input_width', [5, 6])
 @pytest.mark.parametrize('dilate', [(1, 1), (2, 2)])
 @pytest.mark.parametrize('grad_nodes', [['im_data'], ['offset_data'], ['weight']])
 def test_modulated_deformable_convolution(num_batch, num_channel_data, num_deformable_group,
                                           input_height, input_width, dilate, grad_nodes):
     output_height = input_height
     output_width = input_width
     im_data = np.random.rand(num_batch, num_channel_data, input_height, input_width)
     offset_data = \
         np.random.rand(num_batch, num_deformable_group * 3 * 3 * 2, output_height, output_width)\
         * 0.8 + 0.1
     mask_data = np.random.rand(num_batch, num_deformable_group * 3 * 3, output_height, output_width)
     mask_data = 0.5 * (1 + np.tanh(0.5 * mask_data)) # sigmoid
     weight = np.random.normal(0, 0.001, (num_channel_data, num_channel_data, 3, 3))
     bias = np.zeros(num_channel_data)

     im_data_var = mx.symbol.Variable(name="im_data").as_np_ndarray()
     offset_data_var = mx.symbol.Variable(name="offset_data").as_np_ndarray()
     mask_data_var = mx.symbol.Variable(name="mask_data").as_np_ndarray()
     weight_var = mx.symbol.Variable(name="weight").as_np_ndarray()
     bias_var = mx.symbol.Variable(name="bias").as_np_ndarray()
     op = mx.sym.npx.modulated_deformable_convolution(name='test_op', data=im_data_var,
                                                      offset=offset_data_var, mask=mask_data_var,
                                                      weight=weight_var, bias=bias_var,
                                                      num_filter=num_channel_data, pad=dilate,
                                                      kernel=(3, 3), stride=(1, 1), dilate=dilate,
                                                      num_deformable_group=num_deformable_group)
     if grad_nodes[0] == 'offset_data':
         # wider tolerance needed for coordinate differential
         rtol, atol = 1.0, 1e-2
     else:
         rtol, atol = 0.05, 1e-3


 @use_np
 def test_broadcast_like_different_types():
     x = mx.np.zeros((2, 1))
     y = mx.np.ones((2, 2))

     y = mx.np.array(y).astype('int32')
     z = mx.npx.broadcast_like(x, y, 1, 1)
     assert_almost_equal(z.asnumpy(), np.array([[0,0],[0,0]]))
     assert x.dtype == z.dtype


 @use_np
 def test_np_apply_along_axis_fallback():
     data = np.random.randint(-100, 100, (2, 3))
     axis = 1
     func1d = lambda x: x.mean()
     np_y = onp.apply_along_axis(func1d, 1, data.asnumpy())
     y1 = np.apply_along_axis(func1d, 1, data)
     y2 = np.apply_along_axis(func1d, 1, arr=data)
     assert_almost_equal(y1.asnumpy(), np_y)
     assert y1.asnumpy().dtype == np_y.dtype
     assert_almost_equal(y2.asnumpy(), np_y)
     assert y2.asnumpy().dtype == np_y.dtype


 def check_multihead_attention_selfatt(dtype):
     class TestSelfAtt1(mx.gluon.HybridBlock):
         def __init__(self):
             super().__init__()
             self.batch_size = 2
             self.qkv_length = 7  # length of a sequence
             self.qkv_dim = 9     # dimension of encoding
             self.num_heads = 3   # number of attention head
             self.head_dim = 5    # head size
             self.out_dim = 13 * self.num_heads
             self.qkv_units = self.num_heads * self.head_dim

             self.q_weight = Parameter('q_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.k_weight = Parameter('k_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.v_weight = Parameter('v_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.q_bias = Parameter('q_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.k_bias = Parameter('k_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.v_bias = Parameter('v_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.out_weight = Parameter('out_weight', shape=(self.out_dim, self.qkv_units),
                                         init=None, dtype=dtype, allow_deferred_init=True)
             self.out_bias = Parameter('out_bias', shape=(self.out_dim,),
                                       init=None, dtype=dtype, allow_deferred_init=True)

         def forward(self, qkv):
             device = qkv.device
             qkv_weight = self.convert_weight(self.q_weight.data().to_device(device),
                                              self.k_weight.data().to_device(device),
                                              self.v_weight.data().to_device(device),
                                              self.num_heads)
             qkv_bias = self.convert_bias(self.q_bias.data().to_device(device),
                                          self.k_bias.data().to_device(device),
                                          self.v_bias.data().to_device(device),
                                          self.num_heads)
             qkv = np.transpose(qkv, axes=(1, 0, 2))
             qkv_proj = npx.fully_connected(qkv, weight=qkv_weight, bias=qkv_bias, flatten=False,
                                            num_hidden=self.qkv_units * 3, no_bias=False)
             att_score = npx.interleaved_matmul_selfatt_qk(qkv_proj, heads=self.num_heads)
             weighted_value = npx.interleaved_matmul_selfatt_valatt(qkv_proj, att_score, heads=self.num_heads)
             output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device),
                                          bias=self.out_bias.data().to_device(device), flatten=False,
                                          num_hidden=self.out_dim, no_bias=False)
             return np.transpose(output, axes=(1, 0, 2)), att_score

         def convert_weight(self, q_weight, k_weight, v_weight, num_heads):
             q_weight = npx.reshape(q_weight, (num_heads, -1, -2), reverse=True)
             k_weight = npx.reshape(k_weight, (num_heads, -1, -2), reverse=True)
             v_weight = npx.reshape(v_weight, (num_heads, -1, -2), reverse=True)
             all_weights = np.concatenate([q_weight, k_weight, v_weight], axis=-2)
             all_weights = npx.reshape(all_weights, (-1, -2), reverse=True)
             return all_weights

         def convert_bias(self, q_bias, k_bias, v_bias, num_heads):
             q_bias = npx.reshape(q_bias, (num_heads, -1))
             k_bias = npx.reshape(k_bias, (num_heads, -1))
             v_bias = npx.reshape(v_bias, (num_heads, -1))
             all_bias = np.stack([q_bias, k_bias, v_bias], axis=1)
             all_bias = npx.reshape(all_bias, (-1,))
             return all_bias

     class TestSelfAtt2(mx.gluon.HybridBlock):
         def __init__(self):
             super().__init__()
             self.batch_size = 2
             self.qkv_length = 7  # length of a sequence
             self.qkv_dim = 9     # dimension of encoding
             self.num_heads = 3   # number of attention head
             self.head_dim = 5    # head size
             self.out_dim = 13 * self.num_heads
             self.qkv_units = self.num_heads * self.head_dim

             self.q_weight = Parameter('q_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.k_weight = Parameter('k_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.v_weight = Parameter('v_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.q_bias = Parameter('q_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.k_bias = Parameter('k_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.v_bias = Parameter('v_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.out_weight = Parameter('out_weight', shape=(self.out_dim, self.qkv_units),
                                         init=None, dtype=dtype, allow_deferred_init=True)
             self.out_bias = Parameter('out_bias', shape=(self.out_dim,),
                                       init=None, dtype=dtype, allow_deferred_init=True)

         def forward(self, qkv):
             device = qkv.device
             q = npx.fully_connected(qkv, weight=self.q_weight.data().to_device(device),
                                     bias=self.q_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             k = npx.fully_connected(qkv, weight=self.k_weight.data().to_device(device),
                                     bias=self.k_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             v = npx.fully_connected(qkv, weight=self.v_weight.data().to_device(device),
                                     bias=self.v_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             q = npx.reshape(q, (-2, -2, self.num_heads, -1))
             q = np.transpose(q, axes=(0, 2, 1, 3))
             q = npx.reshape(q, (-1, -2, -2), reverse=True)
             k = npx.reshape(k, (-2, -2, self.num_heads, -1))
             k = np.transpose(k, axes=(0, 2, 1, 3))
             k = npx.reshape(k, (-1, -2, -2), reverse=True)
             q = q / np.sqrt(q.shape[-1])
             qkv = np.transpose(qkv, axes=(1, 0, 2))
             att_score = npx.batch_dot(q, k, transpose_b=True)

             v = npx.reshape(v, (-2, -2, self.num_heads, -1))
             v = np.transpose(v, axes=(0, 2, 1, 3))
             v = npx.reshape(v, (-1, -2, -2), reverse=True)
             weighted_value = npx.batch_dot(att_score, v)
             weighted_value = npx.reshape(weighted_value, (-1, self.num_heads, -2, -2),
                                          reverse=True)
             weighted_value = np.transpose(weighted_value, axes=(0, 2, 1, 3))
             weighted_value = npx.reshape(weighted_value, (-2, -2, -1))
             output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device),
                                          bias=self.out_bias.data().to_device(device), flatten=False,
                                          num_hidden=self.out_dim, no_bias=False)
             return output, att_score

     qkv = np.random.uniform(size=(2, 7, 9), dtype=dtype)
     block1 = TestSelfAtt1()
     block2 = TestSelfAtt2()
     block1.initialize()
     block2.initialize()
     params1 = block1.collect_params()
     params2 = block2.collect_params()
     orig_params1 = copy.deepcopy(params1)
     for key, val in orig_params1.items():
         params2[key].set_data(copy.deepcopy(val.data()))
     block1.hybridize()
     block2.hybridize()
     with mx.autograd.record():
         out1, att_score1 = block1(qkv)
     out1.backward()
     with mx.autograd.record():
         out2, att_score2 = block2(qkv)
     out2.backward()
     grads1 = {k : v for k, v in params1.items()}
     grads2 = {k : v for k, v in params2.items()}
     assert_allclose(att_score1.asnumpy(), att_score2.asnumpy(), rtol=1e-2, atol=1e-3)
     assert_allclose(out1.asnumpy(), out2.asnumpy(), rtol=1e-2, atol=1e-3)

     for k in grads1.keys():
         assert(grads1[k].data().dtype == grads2[k].data().dtype)
         assert(grads1[k].data().shape == grads2[k].data().shape)
         assert_allclose(grads1[k].data().asnumpy(), grads2[k].data().asnumpy(), rtol=1e-2, atol=1e-3)


 @use_np
 @assert_raises_cuda_not_satisfied(min_version='9.1')
 @pytest.mark.serial
 def test_multihead_attention_selfatt():
     dtypes = ['float32']
     if mx.device.current_device().device_type == 'gpu':
         dtypes += ['float16']

     for dtype in dtypes:
         check_multihead_attention_selfatt(dtype=dtype)


 def check_multihead_attention_encdec(dtype):
     class TestSelfAtt1(mx.gluon.HybridBlock):
         def __init__(self):
             super().__init__()
             self.batch_size = 2
             self.qkv_length = 7  # length of a sequence
             self.qkv_dim = 9     # dimension of encoding
             self.num_heads = 3   # number of attention head
             self.head_dim = 5    # head size
             self.out_dim = 13 * self.num_heads
             self.qkv_units = self.num_heads * self.head_dim

             self.q_weight = Parameter('q_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.k_weight = Parameter('k_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.v_weight = Parameter('v_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.q_bias = Parameter('q_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.k_bias = Parameter('k_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.v_bias = Parameter('v_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.out_weight = Parameter('out_weight', shape=(self.out_dim, self.qkv_units),
                                         init=None, dtype=dtype, allow_deferred_init=True)
             self.out_bias = Parameter('out_bias', shape=(self.out_dim,),
                                       init=None, dtype=dtype, allow_deferred_init=True)

         def forward(self, q, kv):
             device = kv.device
             kv_weight = self.convert_weight(self.k_weight.data().to_device(device),
                                             self.v_weight.data().to_device(device),
                                             self.num_heads)
             kv_bias = self.convert_bias(self.k_bias.data().to_device(device),
                                         self.v_bias.data().to_device(device),
                                         self.num_heads)
             kv = np.transpose(kv, axes=(1, 0, 2))
             kv_proj = npx.fully_connected(kv, weight=kv_weight, bias=kv_bias, flatten=False,
                                           num_hidden=self.qkv_units * 2, no_bias=False)
             q = np.transpose(q, axes=(1, 0, 2))
             q_proj = npx.fully_connected(q, weight=self.q_weight.data().to_device(device),
                                          bias=self.q_bias.data().to_device(device), flatten=False,
                                          num_hidden=self.qkv_units, no_bias=False)
             att_score = npx.interleaved_matmul_encdec_qk(q_proj, kv_proj, heads=self.num_heads)
             weighted_value = npx.interleaved_matmul_encdec_valatt(kv_proj, att_score, heads=self.num_heads)
             output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device),
                                          bias=self.out_bias.data().to_device(device), flatten=False,
                                          num_hidden=self.out_dim, no_bias=False)
             return np.transpose(output, axes=(1, 0, 2)), att_score

         def convert_weight(self, k_weight, v_weight, num_heads):
             k_weight = npx.reshape(k_weight, (num_heads, -1, -2), reverse=True)
             v_weight = npx.reshape(v_weight, (num_heads, -1, -2), reverse=True)
             all_weights = np.concatenate([k_weight, v_weight], axis=-2)
             all_weights = npx.reshape(all_weights, (-1, -2), reverse=True)
             return all_weights

         def convert_bias(self, k_bias, v_bias, num_heads):
             k_bias = npx.reshape(k_bias, (num_heads, -1))
             v_bias = npx.reshape(v_bias, (num_heads, -1))
             all_bias = np.stack([k_bias, v_bias], axis=1)
             all_bias = npx.reshape(all_bias, (-1,))
             return all_bias

     class TestSelfAtt2(mx.gluon.HybridBlock):
         def __init__(self):
             super().__init__()
             self.batch_size = 2
             self.qkv_length = 7  # length of a sequence
             self.qkv_dim = 9     # dimension of encoding
             self.num_heads = 3   # number of attention head
             self.head_dim = 5    # head size
             self.out_dim = 13 * self.num_heads
             self.qkv_units = self.num_heads * self.head_dim

             self.q_weight = Parameter('q_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.k_weight = Parameter('k_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.v_weight = Parameter('v_weight', shape=(self.qkv_units, self.qkv_dim),
                                       init=None, dtype=dtype, allow_deferred_init=True)
             self.q_bias = Parameter('q_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.k_bias = Parameter('k_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.v_bias = Parameter('v_bias', shape=(self.qkv_units,),
                                     init=None, dtype=dtype, allow_deferred_init=True)
             self.out_weight = Parameter('out_weight', shape=(self.out_dim, self.qkv_units),
                                         init=None, dtype=dtype, allow_deferred_init=True)
             self.out_bias = Parameter('out_bias', shape=(self.out_dim,),
                                       init=None, dtype=dtype, allow_deferred_init=True)

         def forward(self, q, kv):
             device = kv.device
             q = npx.fully_connected(q, weight=self.q_weight.data().to_device(device),
                                     bias=self.q_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             k = npx.fully_connected(kv, weight=self.k_weight.data().to_device(device),
                                     bias=self.k_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             v = npx.fully_connected(kv, weight=self.v_weight.data().to_device(device),
                                     bias=self.v_bias.data().to_device(device), flatten=False,
                                     num_hidden=self.qkv_units, no_bias=False)
             q = npx.reshape(q, (-2, -2, self.num_heads, -1))
             q = np.transpose(q, axes=(0, 2, 1, 3))
             q = npx.reshape(q, (-1, -2, -2), reverse=True)
             k = npx.reshape(k, (-2, -2, self.num_heads, -1))
             k = np.transpose(k, axes=(0, 2, 1, 3))
             k = npx.reshape(k, (-1, -2, -2), reverse=True)
             q = q / np.sqrt(q.shape[-1])
             att_score = npx.batch_dot(q, k, transpose_b=True)

             v = npx.reshape(v, (-2, -2, self.num_heads, -1))
             v = np.transpose(v, axes=(0, 2, 1, 3))
             v = npx.reshape(v, (-1, -2, -2), reverse=True)
             weighted_value = npx.batch_dot(att_score, v)
             weighted_value = npx.reshape(weighted_value, (-1, self.num_heads, -2, -2),
                                          reverse=True)
             weighted_value = np.transpose(weighted_value, axes=(0, 2, 1, 3))
             weighted_value = npx.reshape(weighted_value, (-2, -2, -1))
             output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device),
                                          bias=self.out_bias.data().to_device(device), flatten=False,
                                          num_hidden=self.out_dim, no_bias=False)
             return output, att_score

     q = np.random.uniform(size=(2, 7, 9), dtype=dtype)
     kv = np.random.uniform(size=(2, 7, 9), dtype=dtype)
     block1 = TestSelfAtt1()
     block2 = TestSelfAtt2()
     block1.initialize()
     block2.initialize()
     params1 = block1.collect_params()
     params2 = block2.collect_params()
     orig_params1 = copy.deepcopy(params1)
     for key, val in orig_params1.items():
         params2[key].set_data(copy.deepcopy(val.data()))
     block1.hybridize()
     block2.hybridize()
     with mx.autograd.record():
         out1, att_score1 = block1(q, kv)
     out1.backward()
     with mx.autograd.record():
         out2, att_score2 = block2(q, kv)
     out2.backward()
     grads1 = {k : v for k, v in params1.items()}
     grads2 = {k : v for k, v in params2.items()}
     assert_allclose(att_score1.asnumpy(), att_score2.asnumpy(), rtol=1e-2, atol=1e-3)
     assert_allclose(out1.asnumpy(), out2.asnumpy(), rtol=1e-2, atol=1e-3)

     for k in grads1.keys():
         assert(grads1[k].data().dtype == grads2[k].data().dtype)
         assert(grads1[k].data().shape == grads2[k].data().shape)
         assert_allclose(grads1[k].data().asnumpy(), grads2[k].data().asnumpy(), rtol=1e-2, atol=1e-3)


 @use_np
 @assert_raises_cuda_not_satisfied(min_version='9.1')
 @pytest.mark.serial
 def test_multihead_attention_encdec():
     dtypes = ['float32']
     if mx.device.current_device().device_type == 'gpu':
         dtypes += ['float16']

     for dtype in dtypes:
         check_multihead_attention_encdec(dtype=dtype)


 @use_np
 def test_add_n():
     data_shape = (2, 2)
     input_num = 5
     data = [np.random.uniform(size=data_shape) for i in range(input_num)]
     rslt = np.zeros(shape=data_shape)
     for i in range(input_num):
         rslt += data[i]
     add_n_rslt = npx.add_n(*data, out=data[0])
     assert_almost_equal(rslt.asnumpy(), add_n_rslt.asnumpy(), atol=1e-5)


 @use_np
 def test_slice_like():
     for ndim in range(1, 6):
         from_shape = onp.random.randint(1, 11, size=(ndim,))
         shape = [s + onp.random.randint(0, 3) for s in from_shape]
         for t in range(ndim):
             if t > 0:
                 axes = onp.random.randint(0, ndim, size=t).tolist()
             else:
                 axes = []
             idx = []
             for i in range(ndim):
                 idx.append(slice(0, shape[i]))
                 if i in axes or not axes:
                     idx[i] = slice(0, from_shape[i])

             if axes:
                 pos = onp.random.randint(0, t)
                 if axes[pos] > 0:
                     axes[pos] -= ndim  # negative index
             x = np.array(onp.random.normal(size=shape))
             x1 = np.array(onp.random.normal(size=from_shape))
             x.attach_grad()
             x1.attach_grad()
             with mx.autograd.record():
                 y = npx.slice_like(data=x, shape_like=x1, axes=axes)
             y.backward()
             assert_allclose(x.asnumpy()[idx], y.asnumpy())

             xx = x.asnumpy()
             xx[:] = 0.0
             xx[idx] = x.asnumpy()[idx]
             assert_allclose(x1.grad.asnumpy(), np.zeros_like(x1.grad).asnumpy())


 @use_np
 @pytest.mark.parametrize('shape,num_filter,num_group,kernel,pad', [
     ((1, 4, 15), 16, 2, (2,), (0,)),
     ((8, 4, 16), 16, 1, (3,), (1,)),

     ((1, 4, 15, 16), 16, 2, (2, 2), (0, 0)),
     ((8, 4, 16, 16), 16, 1, (3, 3), (1, 1)),

     ((1, 4, 3, 15, 16), 16, 2, (2, 2, 2), (0, 0, 0)),
     ((8, 4, 3, 16, 16), 16, 1, (3, 3, 3), (1, 1, 1))])
 def test_npx_deconvolution(shape, num_filter, num_group, kernel, pad):
     if len(kernel) == 3 and mx.current_device().device_type == 'gpu':
         pytest.skip('Skipping deconvoluition 3D tests for GPU')

     class TestConv(mx.gluon.HybridBlock):
         def __init__(self, w):
             super().__init__()
             self.weight = w

         def forward(self, x, *args):
             return npx.convolution(x, self.weight.data(x.device), no_bias=True, kernel=kernel,
                                    pad=pad, num_filter=self.weight.shape[0], num_group=num_group)

     class TestDeconv(mx.gluon.HybridBlock):
         def __init__(self):
             super().__init__()
             self.weight = mx.gluon.Parameter('weight', shape=(shape[1], int(num_filter/num_group),
                                                               *kernel))
             self.bias = mx.gluon.Parameter('bias', shape=num_filter)

         def forward(self, x, *args):
             return npx.deconvolution(x, self.weight.data(x.device), self.bias.data(x.device), kernel,
                                      pad=pad, num_filter=num_filter, num_group=num_group)

     deconvNet = TestDeconv()
     deconvNet.initialize()

     # test imperative
     deconvData = np.random.uniform(0, 1, size=shape)
     npx_out_imp = deconvNet(deconvData)

     # test symbolic
     deconvNet.hybridize()
     deconvNet(deconvData)
     npx_out_sym = deconvNet(deconvData)
     assert_almost_equal(npx_out_imp, npx_out_sym)

     # compare outputs with reference tensors generated using convolution
     convNet = TestConv(deconvNet.weight)
     convNet.initialize()
     convData = np.random.uniform(0, 1, size=npx_out_imp.shape)
     convData.attach_grad()
     with mx.autograd.record():
         convOut = convNet(convData)
         y = np.reshape(convOut, -1)
         y = np.sum(y)
     y.backward()

     deconvData = np.ones_like(convOut)  # gradient of convOut
     deconvBias = np.repeat(deconvNet.bias.data(), int(np.prod(np.array(convData.grad.shape[2:])).item()))
     deconvRefOut = np.copy(convData.grad) + deconvBias.reshape((convData.grad.shape[1:]))
     deconvData.attach_grad()
     with mx.autograd.record():
         deconvOut = deconvNet(deconvData)
     deconvOut.backward()

     convData = np.ones_like(deconvOut)
     deconvRefGrad = convNet(convData)

     assert_almost_equal(deconvOut, deconvRefOut)
     assert_almost_equal(deconvData.grad, deconvRefGrad)


 @use_np
 @pytest.mark.parametrize('dtype', np.floating_dtypes)
 def test_np_finfo(dtype):
     mx_finfo_obj = np.finfo(dtype)
     np_finfo = onp.finfo(dtype)
     assert (mx_finfo_obj.bits, mx_finfo_obj.eps, mx_finfo_obj.max, mx_finfo_obj.min, mx_finfo_obj.smallest_normal) == \
         (np_finfo.bits, np_finfo.eps, np_finfo.max, np_finfo.min, np_finfo.tiny)


 @use_np
 @pytest.mark.parametrize('dtype', np.integer_dtypes)
 def test_np_iinfo(dtype):
     mx_iinfo_obj = np.iinfo(dtype)
     np_iinfo = onp.iinfo(dtype)
     assert (mx_iinfo_obj.bits, mx_iinfo_obj.max, mx_iinfo_obj.min) == \
         (np_iinfo.bits, np_iinfo.max, np_iinfo.min)


 @use_np
 @pytest.mark.parametrize('input1', [d for d in np.numeric_dtypes + np.boolean_dtypes] + [np.ones((1,), dtype=d) for d in np.numeric_dtypes + np.boolean_dtypes])
 @pytest.mark.parametrize('input2', [d for d in np.numeric_dtypes + np.boolean_dtypes])
 def test_np_can_cast(input1, input2):
     np_input1 = input1
     np_input2 = input2
     if isinstance(input1, np.ndarray):
         np_input1 = input1.asnumpy()
     assert np.can_cast(input1, input2) == onp.can_cast(np_input1, np_input2)


 @use_np
 @pytest.mark.parametrize('nums', [1, 2, 3, 4, 10, 100])
 def test_np_result_type(nums):
     PICK_LIST = np.numeric_dtypes + np.boolean_dtypes + [np.ones((1,), dtype=d) for d in np.numeric_dtypes + np.boolean_dtypes]
     import random
     inputs = [random.choice(PICK_LIST) for _ in range(nums)]

     try:
         promoted = np.result_type(*inputs)
     except Exception as e:
         with pytest.raises(TypeError):
             promoted = np.result_type(*inputs)


 @use_np
 @pytest.mark.parametrize('func,func2,dtypes,ref_grad,low,high', [
     ('abs', 'abs', 'numeric', lambda x: -1. * (x < 0) + (x > 0), -1.0, 1.0),
     ('acos', 'arccos', 'floating-point', lambda x: -1. / (1. - x ** 2.) ** (1. / 2.), -1.0, 1.0),
     ('acosh', 'arccosh', 'floating-point', lambda x: 1./(x**2 - 1.)**(1./2.), 2.0, 5.0),
     ('asin', 'arcsin', 'floating-point', lambda x: 1. / (1. - x ** 2) ** (1. / 2.), -1.0, 1.0),
     ('asinh', 'arcsinh', 'floating-point', lambda x: 1./(x**2 + 1.)**(1./2.), -1.0, 1.0),
     ('atan', 'arctan', 'floating-point', lambda x: 1. / (x ** 2. + 1.), -1.0, 1.0),
     ('atanh', 'arctanh', 'floating-point', lambda x: -1./(x**2 - 1.), -0.99, 0.99),
     ('bitwise_invert', 'invert', 'integer or boolean', None, -5, 5),
     ('ceil', 'ceil', 'numeric', None, -10.0, 10.0),
     ('cos', 'cos', 'floating-point', lambda x: -onp.sin(x), -1.0, 1.0),
     ('cosh', 'cosh', 'floating-point', lambda x: onp.sinh(x), -1.0, 1.0),
     ('exp', 'exp', 'floating-point', lambda x: onp.exp(x), -1.0, 1.0),
     ('expm1', 'expm1', 'floating-point', lambda x: onp.exp(x), -1.0, 1.0),
     ('floor', 'floor', 'numeric', None, -10.0, 10.0),
     ('log', 'log', 'floating-point', lambda x: 1.0 / x, 0.1, 5.0),
     ('log10', 'log10', 'floating-point', lambda x: 1.0 / (x * onp.log(10)), 0.1, 10.0),
     ('log1p', 'log1p', 'floating-point', lambda x: 1.0 / (1.0 + x), -0.9, 5.0),
     ('log2', 'log2', 'floating-point', lambda x: 1.0 / (x * onp.log(2)), 0.1, 2.0),
     ('logical_not', 'logical_not', 'boolean', None,  -1.0, 1.0),
     ('negative', 'negative', 'numeric', lambda x: -1. * onp.ones(x.shape), -1.0, 1.0),
     ('positive', 'positive', 'numeric', lambda x: onp.ones(x.shape), -1.0, 1.0),
     ('sign', 'sign', 'numeric', None, -1.0, 1.0),
     ('sin', 'sin', 'floating-point', lambda x: onp.cos(x), -1.0, 1.0),
     ('sinh', 'sinh', 'floating-point', lambda x: onp.cosh(x), -1.0, 1.0),
     ('sqrt', 'sqrt', 'floating-point', lambda x: 0.5 / onp.sqrt(x), 0.001, 10.0),
     ('square', 'square', 'numeric', lambda x: 2.0 * x, -1.0, 1.0),
     ('tan', 'tan', 'floating-point', lambda x: onp.tan(x) ** 2 + 1.0, -1.0, 1.0),
     ('tanh', 'tanh', 'floating-point', lambda x: 1. - onp.tanh(x) ** 2, -1.0, 1.0),
     ('trunc', 'trunc', 'numeric', None, -5.0, 5.0),
 ])
 @pytest.mark.parametrize('ndim', [2, 3, 4])
 def test_np_standard_unary_funcs(func, func2, dtypes, ref_grad, low, high, ndim):
     class TestStandardUnary(HybridBlock):
         def __init__(self, func):
             super(TestStandardUnary, self).__init__()
             self._func = func

         def forward(self, a):
             return getattr(np, self._func)(a)

     type_mapping = {
         'floating-point': np.floating_dtypes,
         'numeric': np.numeric_dtypes,
         'integer or boolean': np.integer_dtypes + np.boolean_dtypes,
         'boolean': np.boolean_dtypes,
     }

     def array_values(low, high, shape):
         for d in np.integer_dtypes + np.boolean_dtypes + np.floating_dtypes:
             yield onp.random.uniform(low, high, shape).astype(d), d


     shapes = [i for i in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]]
     for shape in shapes:
         for (np_test_data, dtype) in array_values(low, high, shape):
             if dtype in type_mapping[dtypes]:
                 rtol = 1e-2 if dtype == np.float16 else 1e-3
                 atol = 1e-4 if dtype == np.float16 else 1e-5
                 # get rid of warning: divide by zero
                 if((func=='log' or func=='log10' or func=='log2') and
                     (dtype=='int8' or dtype=='uint8' or dtype=='int32' or
                     dtype=='int64')):
                     low = 1
                 if (func=='arctanh' and dtype=='bool'):
                     continue
                 np_func = getattr(onp, func2)
                 mx_func = TestStandardUnary(func)
                 mx_test_data = np.array(np_test_data, dtype=dtype)
                 for hybridize in [True, False]:
                     if hybridize:
                         mx_func.hybridize()
                     if ref_grad:
                         mx_test_data.attach_grad()
                     np_out = np_func(np_test_data)
                     with mx.autograd.record():
                         y = mx_func(mx_test_data)
                     assert y.shape == np_out.shape
                     assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=atol)
                     if np_out.dtype == np.bool_:
                         assert y.dtype == np.bool_

                     if ref_grad and (dtype == 'float16' or dtype == 'float32' or dtype == 'float64'):
                         y.backward()
                         assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True)

                 np_func = getattr(onp, func2)
                 mx_out = getattr(mx.np, func)(mx_test_data)
                 assert mx_out.shape == np_out.shape
                 assert np.result_type(mx_out) == dtype
                 assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=1e-5)

                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False)
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, subok=False)
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, dtype=onp.int8)
                 assertRaises(TypeError, getattr(np, func), mx_test_data, dtype="abcdefg")
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, casting='safe')
                 assertRaises(TypeError, getattr(np, func), mx_test_data, casting='mxnet')
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='C')
                 assertRaises(NotImplementedError, getattr(np, func), mx_test_data, order='mxnet')


 @use_np
 @pytest.mark.flaky
 @pytest.mark.parametrize('func,func2,promoted,dtypes,ref_grad_a,ref_grad_b,low,high', [
     ('add', 'add', True, 'numeric', lambda y, x1, x2: onp.ones(y.shape), None, -1.0, 1.0),
     ('atan2', 'arctan2', True, 'floating-point', lambda y, x1, x2: x2 / (onp.square(x1) + onp.square(x2)),
                                                  lambda y, x1, x2: -x1 / (onp.square(x1) + onp.square(x2)), -1, 1),
     ('bitwise_and', 'bitwise_and', True, 'integer or boolean', None, None, -100, 100),
     ('bitwise_or', 'bitwise_or', True, 'integer or boolean', None, None, -100, 100),
     ('bitwise_xor', 'bitwise_xor', True, 'integer or boolean', None, None, -100, 100),
     ('divide', 'divide', True, 'floating-point', lambda y, x1, x2: onp.ones(y.shape) / x2,
                                                  lambda y, x1, x2: -x1 / (x2 * x2), 0.1, 1.0),
     ('equal', 'equal', False, 'all', None, None, 0.0, 2.0),
     ('floor_divide', 'floor_divide', True, 'numeric', lambda y, x1, x2: onp.zeros(y.shape),
                                                       lambda y, x1, x2: onp.zeros(y.shape), 2.0, 10.0),
     ('greater', 'greater', False, 'numeric', None, None, 0.0, 2.0),
     ('greater_equal', 'greater_equal', False, 'numeric', None, None, 0.0, 2.0),
     ('less', 'less', False, 'numeric', None, None, 0.0, 2.0),
     ('less_equal', 'less_equal', False, 'numeric', None, None, 0.0, 2.0),
     ('logaddexp', 'logaddexp', True, 'floating-point', lambda y, x1, x2: onp.exp(x1) / (onp.exp(x1) + onp.exp(x2)),
                                                        lambda y, x1, x2: onp.exp(x2) / (onp.exp(x1) + onp.exp(x2)), -10, 10),
     ('logical_and', 'logical_and', False, 'boolean', None, None, -100, 100),
     ('logical_or', 'logical_or', False, 'boolean', None, None, -100, 100),
     ('logical_xor', 'logical_xor', False, 'boolean', None, None, -100, 100),
     ('multiply', 'multiply', True, 'numeric', lambda y, x1, x2: onp.broadcast_to(x2, y.shape),
                                               lambda y, x1, x2: onp.broadcast_to(x1, y.shape), -1.0, 1.0),
     ('not_equal', 'not_equal', False, 'all', None, None, 0.0, 2.0),
     ('pow', 'power', True, 'floating-point', lambda y, x1, x2: onp.power(x1, x2 - 1.0) * x2,
                                              lambda y, x1, x2: onp.power(x1, x2) * onp.log(x1), 1.0, 3.0),
     ('subtract', 'subtract', True, 'numeric', lambda y, x1, x2: onp.ones(y.shape),
                                               lambda y, x1, x2: -onp.ones(y.shape), -1.0, 1.0),
 ])
 @pytest.mark.parametrize('lshape,rshape', [
     ((3, 2), (3, 2)),
     ((3, 2), (3, 1)),
     ((3, 1), (3, 0)),
     ((0, 2), (1, 2)),
     ((2, 3, 4), (3, 1)),
 # MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays.
 # See https://github.com/apache/incubator-mxnet/issues/20898.
 #    ((2, 3), ()),
 #    ((), (2, 3))
     ((2, 3), (1,)),
     ((1,), (2, 3))
 ])
 def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape):
     class TestStandardBinary(HybridBlock):
         def __init__(self, func):
             super(TestStandardBinary, self).__init__()
             self._func = func

         def forward(self, a, b,):
             return getattr(np, self._func)(a, b)

     type_mapping = {
         'floating-point': np.floating_dtypes,
         'numeric': np.numeric_dtypes,
         'integer or boolean': np.integer_dtypes + np.boolean_dtypes,
         'boolean': np.boolean_dtypes,
         'all': np.numeric_dtypes + np.boolean_dtypes,
     }

     def array_values(low, high, shape):
         for d in np.integer_dtypes + np.boolean_dtypes + np.floating_dtypes:
             yield onp.random.uniform(low, high, shape).astype(d), d


     for (left_value, ltype) in array_values(low, high, lshape):
         for (right_value, rtype) in array_values(low, high, rshape):
             if ltype in type_mapping[dtypes] and rtype in type_mapping[dtypes]:
                 try:
                     promote_type = np.result_type(ltype, rtype)
                 except Exception as e:
                     # Unkown type promotion between two types
                     continue
                 rtol = 1e-2 if ltype == np.float16 or rtype == np.float16 else 1e-3
                 atol = 1e-4 if ltype == np.float16 or rtype == np.float16 else 1e-5
                 mx_left_value = np.array(left_value, dtype=ltype)
                 mx_right_value = np.array(right_value, dtype=rtype)
                 mx_func = TestStandardBinary(func)
                 np_func = getattr(onp, func2)
                 for hybridize in [True, False]:
                     if hybridize:
                         mx_func.hybridize()
                     if ref_grad_a:
                         mx_left_value.attach_grad()
                         mx_right_value.attach_grad()
                     np_out = np_func(left_value, right_value)
                     with mx.autograd.record():
                         y = mx_func(mx_left_value, mx_right_value)
                     assert y.shape == np_out.shape
                     assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=rtol, atol=atol,
                                         use_broadcast=False, equal_nan=True)

                     if ref_grad_a and ltype in np.floating_dtypes and rtype in np.floating_dtypes:
                         y.backward()
                         assert_almost_equal(mx_left_value.grad.asnumpy(),
                                             collapse_sum_like(ref_grad_a(y.asnumpy(), left_value, right_value), mx_left_value.shape),
                                             rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                         if ref_grad_b is None:
                             assert_almost_equal(mx_right_value.grad.asnumpy(),
                                                 collapse_sum_like(ref_grad_a(y.asnumpy(), right_value, left_value), mx_right_value.shape),
                                                 rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)
                         else:
                             assert_almost_equal(mx_right_value.grad.asnumpy(),
                                                 collapse_sum_like(ref_grad_b(y.asnumpy(), left_value, right_value), mx_right_value.shape),
                                                 rtol=1e-1, atol=1e-2, equal_nan=True, use_broadcast=False)

                 np_out = getattr(onp, func2)(left_value, right_value)
                 mx_out = getattr(np, func)(mx_left_value, mx_right_value)
                 assert mx_out.shape == np_out.shape
                 if promoted:
                     assert np.result_type(ltype, rtype) == mx_out.dtype
                 else:
                     assert mx_out.dtype == np.bool_
                 assert_almost_equal(mx_out.asnumpy(), np_out.astype(mx_out.dtype), rtol=rtol, atol=atol,
                                     use_broadcast=False, equal_nan=True)