# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from mxnet.test_utils import *
from mxnet.base import MXNetError
import pytest
from common import assertRaises
import random
import warnings
def is_scalar(var):
return False if hasattr(var, "__len__") else True
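# The get_result_type* helpers probe a numpy reference op with a zero input to
# infer the expected output storage type: if zeros map to zeros, the sparse
# layout (dflt_stype) can be preserved, otherwise the result must densify to
# 'default'. For example, np.sign(0) == 0, so a row_sparse input keeps
# 'row_sparse', while np.cos(0) == 1 forces a 'default' result.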
def get_result_type(call, dflt_stype):
"""Try to infer result storage type for a sparse matrix and a given unary operation"""
if call is not None and dflt_stype != 'default':
zero = np.zeros([1])
result = do_normalize(call(zero))
if not almost_equal(result, zero, equal_nan=True):
expected_result_type = 'default'
else:
if dflt_stype is not None:
expected_result_type = dflt_stype
else:
expected_result_type = 'default'
else:
expected_result_type = 'default'
return expected_result_type
def get_result_type_with_scalar(call, dflt_stype):
"""Try to infer result storage type when operating a sparse matrices and a scalar"""
if call is not None and dflt_stype != 'default':
zero = np.zeros([1])
result = call(zero, 5)
if not almost_equal(result, zero, equal_nan=True):
expected_result_type = 'default'
else:
if dflt_stype is not None:
expected_result_type = dflt_stype
else:
expected_result_type = 'default'
else:
expected_result_type = 'default'
return expected_result_type
def get_result_type_2(call, dflt_stype):
"""Try to infer result storage type when operating on two sparse matrices"""
if call is not None and dflt_stype != 'default':
zero = np.zeros([1])
need_default = False
for outer in [zero, np.ones(zero.shape)]:
for inner in [zero, np.ones(zero.shape)]:
result = do_normalize(call(outer, inner))
if not almost_equal(result, zero, equal_nan=True):
need_default = True
break
if need_default is True:
break
if not need_default and dflt_stype is not None:
expected_result_type = dflt_stype
else:
expected_result_type = 'default'
else:
expected_result_type = 'default'
return expected_result_type
def get_result_type_3(call, dflt_stype):
"""Try to infer result storage type when operating on three sparse matrices"""
if call is not None and dflt_stype != 'default':
zero = np.zeros([1])
need_default = False
for moon in [zero]:
for outer in [zero]:
for inner in [zero]:
res_1, res_2 = call(moon, outer, inner)
result = do_normalize(res_1)
if not almost_equal(result, zero, equal_nan=True):
need_default = True
break
result = do_normalize(res_2)
if not almost_equal(result, zero, equal_nan=True):
need_default = True
break
if need_default is True:
break
if need_default is True:
break
if not need_default and dflt_stype is not None:
expected_result_type = dflt_stype
else:
expected_result_type = 'default'
else:
expected_result_type = 'default'
return expected_result_type
def get_fw_bw_result_types(forward_numpy_call, fwd_res_dflt,
backward_numpy_call, bwd_res_dflt):
return (get_result_type(forward_numpy_call, fwd_res_dflt),
get_result_type(backward_numpy_call, bwd_res_dflt))
def get_fw_bw_result_types_2(forward_numpy_call, fwd_res_dflt,
backward_numpy_call, bwd_res_dflt):
return (get_result_type(forward_numpy_call, fwd_res_dflt),
get_result_type_2(backward_numpy_call, bwd_res_dflt))
def get_fw_bw_result_types_with_scalar(forward_numpy_call, fwd_res_dflt,
backward_numpy_call, bwd_res_dflt):
return (get_result_type_with_scalar(forward_numpy_call, fwd_res_dflt),
get_result_type_with_scalar(backward_numpy_call, bwd_res_dflt))
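# Builds the list of non-zero row indices for a row_sparse array: every row is
# kept with probability `density`, and any `force_indices` are always included,
# which the tests below use to force overlapping non-zero rows between operands.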
def gen_rsp_random_indices(shape, density=.5, force_indices=None):
assert density >= 0 and density <= 1
indices = set()
if force_indices is not None:
for val in force_indices:
indices.add(int(val))
if not np.isclose(density, .0, rtol=1.e-3, atol=1.e-3, equal_nan=True) and len(shape) > 0:
row_count = shape[0]
for i in range(row_count):
r = random.uniform(0, 1)
if r <= density and len(indices) < shape[0]:
indices.add(i)
assert len(indices) <= shape[0]
return list(indices)
def all_zero(var):
return 0
@pytest.mark.skip(reason="https://github.com/apache/incubator-mxnet/issues/18740")
def test_elemwise_binary_ops():
# skip testing on GPU because only CPU ops are implemented
if default_device().device_type == 'gpu':
return
def test_elemwise_binary_op(name, lhs_stype, rhs_stype, shape,
forward_mxnet_call, forward_numpy_call, backward_numpy_call,
lhs_grad_stype,
rhs_grad_stype,
expected_result_storage_type=None,
modifier_func=None,
lhs_density=.5,
rhs_density=.5,
force_lr_overlap=False,
force_grad_overlap=False,
ograd_density=0.0,
skip_gradient_check=False,
shuffle_csr_indices=True,
verbose=False):
if lhs_grad_stype is None:
lhs_grad_stype = lhs_stype
if rhs_grad_stype is None:
rhs_grad_stype = rhs_stype
lhs_grad_stype = get_result_type_3(backward_numpy_call, lhs_grad_stype)
rhs_grad_stype = get_result_type_3(backward_numpy_call, rhs_grad_stype)
if verbose is True:
print("testing: {} lhs={}, rhs={}, lhs_grad_stype={}, rhs_grad_stype={}"
.format(name, lhs_stype, rhs_stype, lhs_grad_stype, rhs_grad_stype))
# Output type should be same as lvalue type, unless otherwise specified
if expected_result_storage_type is None:
if lhs_stype == 'default' or rhs_stype == 'default':
expected_result_storage_type = 'default'
else:
expected_result_storage_type = lhs_stype
lhs = mx.symbol.Variable('lhs', stype=lhs_stype)
rhs = mx.symbol.Variable('rhs', stype=rhs_stype)
grad_stypes = dict()
grad_stypes['lhs'] = lhs_grad_stype
grad_stypes['rhs'] = rhs_grad_stype
if lhs_stype == 'default':
lhs_nd = rand_ndarray(shape, 'default')
if abs(lhs_density) < 1e-4:
func = all_zero
else:
func = modifier_func
lhs_nd = mx.nd.array(assign_each(lhs_nd.asnumpy(), func))
else:
lhs_nd = create_sparse_array_zd(
shape, lhs_stype, density=lhs_density,
modifier_func=modifier_func,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=lhs_density,
force_indices=[(shape[0]/2)] if force_lr_overlap is True else None
))
if rhs_stype == 'default':
rhs_nd = rand_ndarray(shape, 'default')
if abs(rhs_density) < 1e-4:
func = all_zero
else:
func = modifier_func
rhs_nd = mx.nd.array(assign_each(rhs_nd.asnumpy(), func))
else:
rhs_nd = create_sparse_array_zd(
shape, rhs_stype, density=rhs_density,
modifier_func=modifier_func,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=rhs_density,
force_indices=[(shape[0]/2)] if force_lr_overlap is True else None
))
lhs_np = lhs_nd.asnumpy()
rhs_np = rhs_nd.asnumpy()
if verbose is True:
print("lhs input: {}".format(lhs_np))
print("rhs input: {}".format(rhs_np))
out_np = forward_numpy_call(lhs_np, rhs_np)
if verbose is True:
print("out_np: {}".format(out_np))
test = forward_mxnet_call(lhs, rhs)
location = {'lhs': lhs_nd, 'rhs': rhs_nd}
outputs = check_symbolic_forward(test, location, [out_np], equal_nan=True)
assert len(outputs) == 1
assert outputs[0].stype == expected_result_storage_type
if verbose is True:
print ("mx forward output: ", outputs[0].asnumpy())
print ("lhs_nd: ", lhs_nd.stype)
print ("rhs_nd: ", rhs_nd.stype)
print ("forward output: ", outputs[0].stype)
if outputs[0].stype != 'default':
out_grad = create_sparse_array_zd(
shape, outputs[0].stype, density=ograd_density,
data_init=1,
modifier_func=lambda x: 2,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=ograd_density,
force_indices=[(shape[0]/2)] if force_grad_overlap is True else None
))
else:
if abs(ograd_density) < 1e-4:
out_grad = mx.nd.array(np.zeros(shape))
else:
out_grad = mx.nd.array(np.ones(shape))
out_grad_np = out_grad.asnumpy()
if verbose is True:
print("out_grad_np", out_grad_np)
ingrad_lhs_np, ingrad_rhs_np = backward_numpy_call(out_grad_np, lhs_np, rhs_np)
if verbose is True:
print("out_grad", out_grad.asnumpy())
print("ingrad_lhs_np", ingrad_lhs_np)
print("ingrad_rhs_np", ingrad_rhs_np)
igrads_result = check_symbolic_backward(test, location, [out_grad],
[ingrad_lhs_np, ingrad_rhs_np],
grad_stypes=grad_stypes,
equal_nan=True)
if verbose is True:
print("ingrad_lhs", igrads_result['lhs'].asnumpy())
print("ingrad_rhs", igrads_result['rhs'].asnumpy())
assert len(igrads_result) == 2
if lhs_grad_stype is not None:
assert igrads_result['lhs'].stype == lhs_grad_stype
if rhs_grad_stype is not None:
assert igrads_result['rhs'].stype == rhs_grad_stype
if not skip_gradient_check:
check_numeric_gradient(test, location,
grad_stype_dict=grad_stypes)
def check_all(l, r, check_function):
assert l.shape == r.shape
return check_function(l, r)
def gt(l, r):
return check_all(l, r, lambda a, b: a > b)
def ge(l, r):
return check_all(l, r, lambda a, b: a >= b)
def lt(l, r):
return check_all(l, r, lambda a, b: a < b)
def le(l, r):
return check_all(l, r, lambda a, b: a <= b)
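# Storage type expected from elemwise_mul: matching stypes are preserved,
# row_sparse times default stays row_sparse (only non-zero rows contribute),
# and anything else (e.g. csr with default) falls back to 'default'.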
def elemwise_mul_stype(lstype, rstype):
if lstype == rstype:
return lstype
elif lstype == 'default' and rstype == 'row_sparse':
return 'row_sparse'
elif lstype == 'row_sparse' and rstype == 'default':
return 'row_sparse'
else:
return 'default'
def elemwise_mul_lhs_grad_stype(lstype, rstype):
return elemwise_mul_stype(elemwise_mul_stype(lstype, rstype), rstype)
def elemwise_mul_rhs_grad_stype(lstype, rstype):
return elemwise_mul_stype(elemwise_mul_stype(lstype, rstype), lstype)
def check_elemwise_binary_ops(lhs_stype, rhs_stype, shape,
lhs_grad_stype=None, rhs_grad_stype=None,
lhs_density=.5, rhs_density=.5,
force_lr_overlap=False,
force_grad_overlap=False,
ograd_density=0.0):
test_elemwise_binary_op("elemwise_add", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_add(l, r),
lambda l, r: l + r,
lambda outg, l, r: (outg, outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
if ((lhs_stype == 'default' and rhs_stype == 'row_sparse') or
(lhs_stype == 'row_sparse' and rhs_stype == 'row_sparse' and rhs_density == 0.0)):
test_elemwise_binary_op("elemwise_add", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_add(l, r, out=l),
lambda l, r: l + r,
lambda outg, l, r: (outg, outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
test_elemwise_binary_op("elemwise_sub", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_sub(l, r, out=l),
lambda l, r: l - r,
lambda outg, l, r: (outg, -outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
if ((lhs_stype == 'row_sparse' and rhs_stype == 'row_sparse') and (lhs_density == 0.0)):
test_elemwise_binary_op("elemwise_add", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_add(l, r, out=r),
lambda l, r: l + r,
lambda outg, l, r: (outg, outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
test_elemwise_binary_op("elemwise_sub", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_sub(l, r, out=l),
lambda l, r: l - r,
lambda outg, l, r: (outg, -outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
test_elemwise_binary_op("elemwise_sub", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_sub(l, r),
lambda l, r: l - r,
lambda outg, l, r: (outg, -outg),
lhs_grad_stype, rhs_grad_stype,
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density,
rhs_density=rhs_density,
verbose=False)
test_elemwise_binary_op("elemwise_mul", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_mul(l, r),
lambda l, r: l * r,
lambda outg, l, r: (outg * r, outg * l),
elemwise_mul_lhs_grad_stype(lhs_stype, rhs_stype),
elemwise_mul_rhs_grad_stype(lhs_stype, rhs_stype),
expected_result_storage_type=elemwise_mul_stype(lhs_stype, rhs_stype),
ograd_density=ograd_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
verbose=False)
test_elemwise_binary_op("elemwise_div", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym.sparse.elemwise_div(l, r),
lambda l, r: l / r,
lambda outg, l, r: (outg * (1/r), outg * (-l/(r*r))),
lhs_grad_stype, rhs_grad_stype,
modifier_func=lambda a: a if abs(a) > 0.25 else abs(a) + 1,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
ograd_density=ograd_density,
expected_result_storage_type='default',
skip_gradient_check=True,
verbose=False)
test_elemwise_binary_op("maximum", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym._internal._maximum(l, r),
lambda l, r: np.maximum(l, r),
lambda outg, l, r: (outg * ge(l, r), outg * lt(l, r)),
lhs_grad_stype, rhs_grad_stype,
modifier_func=lambda a: a if abs(a) > 0.25 else abs(a) + 1,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
skip_gradient_check=True,
ograd_density=ograd_density,
verbose=False)
test_elemwise_binary_op("minimum", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym._internal._minimum(l, r),
lambda l, r: np.minimum(l, r),
lambda outg, l, r: (outg * le(l, r), outg * gt(l, r)),
lhs_grad_stype, rhs_grad_stype,
modifier_func=lambda a: a if abs(a) > 0.25 else abs(a) + 1,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
ograd_density=ograd_density,
skip_gradient_check=True,
verbose=False)
test_elemwise_binary_op("hypot", lhs_stype, rhs_stype, shape,
lambda l, r: mx.sym._internal._hypot(l, r),
lambda l, r: np.hypot(l, r),
lambda outg, l, r: (
outg * assign_each2(
l, r, lambda a, b: a/np.sqrt(a * a + b * b)),
outg * assign_each2(
l, r, lambda a, b: b/np.sqrt(a * a + b * b))
),
lhs_grad_stype, rhs_grad_stype,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
lhs_density=lhs_density, rhs_density=rhs_density,
ograd_density=ograd_density,
skip_gradient_check=True,
verbose=False)
# Run basic tests
with warnings.catch_warnings():
warnings.simplefilter("ignore")
for _ in range(1):
# Run defaults
check_elemwise_binary_ops('default', 'default', rand_shape_2d(5, 5))
# Try different densities
shape = rand_shape_2d(5, 5)
for lhs_density in [0.0, random.uniform(0, 1), 1.0]:
for rhs_density in [0.0, random.uniform(0, 1), 1.0]:
for ograd_density in [0.0, random.uniform(0, 1), 1.0]:
print("lhs_density={}, rhs_density={}, ograd_density={}, shape: {}".format(
lhs_density, rhs_density, ograd_density, shape))
# Try row_sparse overlaps
for force_lr_overlap in [False, True]:
for force_grad_overlap in [False, True]:
print(" force_lr_overlap={}, force_grad_overlap={}, shape={}".
format(force_lr_overlap, force_grad_overlap, shape))
# Back to left-right overlap possibilities
check_elemwise_binary_ops('row_sparse', 'row_sparse', shape,
lhs_grad_stype='row_sparse',
rhs_grad_stype='row_sparse',
lhs_density=lhs_density,
rhs_density=rhs_density,
force_lr_overlap=force_lr_overlap,
force_grad_overlap=force_grad_overlap,
ograd_density=ograd_density)
def test_elemwise_csr_same_zeros():
# Zeroes
a = mx.nd.sparse.zeros('csr', (1,1))
b = mx.nd.elemwise_add(a,a)
res = a.asnumpy() + a.asnumpy()
assert_almost_equal(b.asnumpy(), res)
def as_dense(arr):
if arr.stype != 'default':
return mx.nd.cast_storage(arr, stype='default')
else:
return arr
# Make sure that 0's look like 0's when we do a comparison
def do_normalize(arr):
ret = arr.copy()
idx = np.isclose(arr, -0, rtol=1.e-3, atol=1.e-3, equal_nan=True)
ret[idx] = 0
return ret
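# check_sparse_mathematical_core: builds a symbolic op from forward_mxnet_call,
# binds it with a dense or sparse input of the requested density, compares the
# forward output (value and stype) against forward_numpy_call, and, when
# backward_numpy_call is given, checks that the input gradient equals
# ograd * backward_numpy_call(data) with the expected gradient stype.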
def check_sparse_mathematical_core(name, stype,
forward_mxnet_call, forward_numpy_call, backward_numpy_call=None,
rhs_arg=None, data_init=9., grad_init=2., output_grad_stype=None,
input_grad_stype=None, force_overlap=False, density=.5,
ograd_density=.5, verbose=False, shuffle_csr_indices=True):
if verbose is True:
print("TESTING: " + name)
data = mx.symbol.Variable('data', stype=stype)
temp_input_grad_stype = input_grad_stype
if temp_input_grad_stype is None:
temp_input_grad_stype = stype
if rhs_arg is not None:
if is_scalar(rhs_arg):
expected_result_type, expected_grad_result_type = \
get_fw_bw_result_types_with_scalar(forward_numpy_call, stype,
backward_numpy_call, temp_input_grad_stype)
else:
expected_result_type, expected_grad_result_type = \
get_fw_bw_result_types_2(forward_numpy_call, stype,
backward_numpy_call, temp_input_grad_stype)
else:
expected_result_type, expected_grad_result_type = \
get_fw_bw_result_types(forward_numpy_call, stype,
backward_numpy_call, temp_input_grad_stype)
if input_grad_stype is not None and input_grad_stype != expected_grad_result_type:
print("{}: explicit override of deduced input grad type '{}' with '{}'".format(
name, expected_grad_result_type, input_grad_stype))
expected_grad_result_type = input_grad_stype
shape = rand_shape_2d()
if verbose is True:
print("Shape: ", shape, "density: ", density, "force_overlap", force_overlap)
if stype == 'default':
data_tmp = np.zeros(shape)
if abs(density) >= 1e-4:
data_tmp[:] = data_init
arr_data = mx.nd.array(data_tmp)
else:
arr_data = create_sparse_array_zd(
shape, stype, density=density,
data_init=data_init,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=density,
force_indices=[(shape[0]/2)] if force_overlap is True else None
)
)
data_tmp = arr_data.asnumpy()
if verbose is True:
print("arr_data indices", arr_data.indices.asnumpy())
if verbose is True:
print("input", data_tmp)
if backward_numpy_call is None:
arr_grad = None
elif expected_grad_result_type == 'default':
if abs(density) < 1e-4:
arr_grad = mx.nd.zeros(shape)
else:
arr_grad = mx.nd.ones(shape)
else:
arr_grad = create_sparse_array_zd(
shape,
expected_grad_result_type,
density=density,
data_init=1,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=density,
force_indices=[(shape[0]/2)] if force_overlap is True else None
)
)
if rhs_arg is not None:
test = forward_mxnet_call(data, rhs_arg)
else:
test = forward_mxnet_call(data)
args = list()
args.append(arr_data)
if arr_grad is not None:
exe_test = test._bind(default_device(), args=args, args_grad=[arr_grad])
else:
exe_test = test._bind(default_device(), args=args)
exe_test.forward(is_train=True)
assert exe_test.outputs[0].stype == expected_result_type
out = exe_test.outputs[0].asnumpy()
if rhs_arg is not None:
npout = forward_numpy_call(data_tmp, rhs_arg)
else:
npout = forward_numpy_call(data_tmp)
if verbose is True:
print("out", out)
print("npout", npout)
assert_almost_equal(out, npout, equal_nan=True)
if backward_numpy_call is not None:
if output_grad_stype == 'default' or output_grad_stype is None:
out_grad = mx.nd.empty(shape)
out_grad[:] = grad_init
else:
out_grad = create_sparse_array_zd(
shape, output_grad_stype,
density=density,
data_init=grad_init,
shuffle_csr_indices=shuffle_csr_indices,
rsp_indices=gen_rsp_random_indices(
shape,
density=ograd_density,
force_indices=[(shape[0]/2)] if force_overlap is True else None))
npout_grad = out_grad.asnumpy()
if verbose is True:
print("npout_grad", npout_grad)
if rhs_arg is not None:
temp = backward_numpy_call(data_tmp, rhs_arg)
else:
temp = backward_numpy_call(data_tmp)
input_grad = npout_grad * temp
if verbose is True:
print(arr_grad.asnumpy())
exe_test.backward(out_grad)
if verbose is True:
print(arr_grad.asnumpy())
assert arr_grad.stype == expected_grad_result_type
if verbose is True:
print(name)
print("arr_grad", arr_grad.asnumpy())
print("input_grad", input_grad)
assert_almost_equal(arr_grad, input_grad, equal_nan=True)
@pytest.mark.serial
@pytest.mark.skip(reason='https://github.com/apache/incubator-mxnet/issues/18829')
def test_sparse_mathematical_core():
def util_sign(a):
if np.isclose(a, -0, rtol=1.e-3, atol=1.e-3, equal_nan=True):
return 0
elif np.isclose(a, 0, rtol=1.e-3, atol=1.e-3, equal_nan=True):
return 0
elif a < 0.0:
return -1
else: # a > 0.0:
return 1
# Check scalar binary operators
def check_binary_op_with_scalar(stype,
output_grad_stype=None,
input_grad_stype=None,
density=.5, ograd_density=.5,
force_overlap=False):
# mul_scalar
check_sparse_mathematical_core("mul_scalar", stype,
lambda x, y: x * y,
lambda x, y: x * y,
lambda input, rhs: rhs,
rhs_arg=5.0,
data_init=2, grad_init=3,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density,
force_overlap=force_overlap,
verbose=False)
# plus_scalar
check_sparse_mathematical_core("plus_scalar", stype,
lambda x, y: x + y,
lambda x, y: x + y,
lambda input, rhs: 1,
rhs_arg=5.0,
data_init=2, grad_init=3,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density,
force_overlap=force_overlap,
verbose=False)
# minus_scalar
check_sparse_mathematical_core("minus_scalar", stype,
lambda x, y: x - y,
lambda x, y: x - y,
lambda input, rhs: 1,
rhs_arg=5.0,
data_init=2, grad_init=3,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density,
force_overlap=force_overlap,
verbose=False)
# Check many basic unary operators
def check_mathematical_core(stype, output_grad_stype=None,
input_grad_stype=None, force_overlap=False,
density=.5, ograd_density=.5):
# negative
check_sparse_mathematical_core("negative", stype,
lambda x: mx.sym.sparse.negative(x),
lambda x: np.negative(x),
force_overlap=force_overlap,
density=density,
input_grad_stype=input_grad_stype,
ograd_density=ograd_density)
# square
check_sparse_mathematical_core("square", stype,
lambda x: mx.sym.sparse.square(x),
lambda x: np.square(x),
lambda x: 2 * x,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density,
verbose=False)
# sqrt
check_sparse_mathematical_core("sqrt", stype,
lambda x: mx.sym.sparse.sqrt(x),
lambda x: np.sqrt(x),
lambda x: 1.0/(2.0 * np.sqrt(x)),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density,
verbose=False)
# cbrt
check_sparse_mathematical_core("cbrt", stype,
lambda x: mx.sym.sparse.cbrt(x),
lambda x: np.cbrt(x),
lambda x: 1.0/(3.0 * np.cbrt(x) * np.cbrt(x)),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density,
verbose=False)
# rint
check_sparse_mathematical_core("rint", stype,
lambda x: mx.sym.sparse.rint(x),
lambda x: np.rint(x),
force_overlap=force_overlap, density=density,
input_grad_stype=input_grad_stype,
ograd_density=ograd_density)
# fix
check_sparse_mathematical_core("fix", stype,
lambda x: mx.sym.sparse.fix(x),
lambda x: np.fix(x),
force_overlap=force_overlap, density=density,
input_grad_stype=input_grad_stype,
ograd_density=ograd_density)
# floor
check_sparse_mathematical_core("floor", stype, lambda x: mx.sym.sparse.floor(x),
lambda x: np.floor(x),
force_overlap=force_overlap,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density)
# ceil
check_sparse_mathematical_core("ceil", stype,
lambda x: mx.sym.sparse.ceil(x),
lambda x: np.ceil(x),
force_overlap=force_overlap,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density)
# round
check_sparse_mathematical_core("round", stype,
lambda x: mx.sym.sparse.round(x),
lambda x: np.round(x),
force_overlap=force_overlap,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density)
# trunc
check_sparse_mathematical_core("trunc", stype,
lambda x: mx.sym.sparse.trunc(x),
lambda x: np.trunc(x),
force_overlap=force_overlap,
input_grad_stype=input_grad_stype,
density=density, ograd_density=ograd_density)
# sign
check_sparse_mathematical_core("sign", stype,
lambda x: mx.sym.sparse.sign(x),
lambda x: np.sign(x),
lambda x: np.zeros(x.shape),
output_grad_stype=output_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# log1p
check_sparse_mathematical_core("log1p", stype,
lambda x: mx.sym.sparse.log1p(x),
lambda x: np.log1p(x),
lambda x: 1. / (1.0 + x),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# expm1
check_sparse_mathematical_core("expm1", stype,
lambda x: mx.sym.sparse.expm1(x),
lambda x: np.expm1(x),
lambda x: np.exp(x),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# sin
check_sparse_mathematical_core("sin", stype,
lambda x: mx.sym.sparse.sin(x),
lambda x: np.sin(x),
lambda x: np.cos(x),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# tan
check_sparse_mathematical_core("tan", stype,
lambda x: mx.sym.sparse.tan(x),
lambda x: np.tan(x),
lambda x: np.tan(x) ** 2 + 1,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
density=density,
ograd_density=ograd_density)
# arcsin
check_sparse_mathematical_core("arcsin", stype,
lambda x: mx.sym.sparse.arcsin(x),
lambda x: np.arcsin(x),
lambda x: 1. / (1. - x ** 2) ** (1. / 2.),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# arctan
check_sparse_mathematical_core("arctan", stype,
lambda x: mx.sym.sparse.arctan(x),
lambda x: np.arctan(x),
lambda x: 1. / (x ** 2. + 1.),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# degrees
check_sparse_mathematical_core("degrees", stype,
lambda x: mx.sym.sparse.degrees(x),
lambda x: np.degrees(x),
lambda x: assign_each(x, lambda a: 180./np.pi),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# radians
check_sparse_mathematical_core("radians", stype,
lambda x: mx.sym.sparse.radians(x),
lambda x: np.radians(x),
lambda x: assign_each(x, lambda a: np.pi / 180.),
data_init=0.6, grad_init=1,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# sinh
check_sparse_mathematical_core("sinh", stype,
lambda x: mx.sym.sparse.sinh(x),
lambda x: np.sinh(x),
lambda x: np.cosh(x),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# tanh
check_sparse_mathematical_core("tanh", stype,
lambda x: mx.sym.sparse.tanh(x),
lambda x: np.tanh(x),
lambda x: 1. - np.tanh(x) ** 2,
data_init=0.5, grad_init=1,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# arcsinh
check_sparse_mathematical_core("arcsinh", stype,
lambda x: mx.sym.sparse.arcsinh(x),
lambda x: np.arcsinh(x),
lambda x: 1./(x**2 + 1.)**(1./2.),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# arctanh
check_sparse_mathematical_core("arctanh", stype,
lambda x: mx.sym.sparse.arctanh(x),
lambda x: np.arctanh(x),
lambda x: -1./(x**2 - 1.),
data_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# abs
check_sparse_mathematical_core("abs", stype,
lambda x: mx.sym.sparse.abs(x),
lambda x: np.abs(x),
lambda x: assign_each(x, function=util_sign),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
if stype != "csr":
# rsqrt
check_sparse_mathematical_core("rsqrt", stype,
lambda x: mx.sym.sparse.rsqrt(x),
lambda x: 1 / np.sqrt(x),
lambda x: -(1.0 / (2.0 * x * np.sqrt(x))),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# cos
check_sparse_mathematical_core("cos", stype,
lambda x: mx.sym.sparse.cos(x),
lambda x: np.cos(x),
lambda x: -np.sin(x),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# arccos
check_sparse_mathematical_core("arccos", stype,
lambda x: mx.sym.sparse.arccos(x),
lambda x: np.arccos(x),
lambda x: -1. / (1. - x ** 2.) ** (1. / 2.),
data_init=0.5, grad_init=0.5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# cosh
check_sparse_mathematical_core("cosh", stype,
lambda x: mx.sym.sparse.cosh(x),
lambda x: np.cosh(x),
lambda x: np.sinh(x),
data_init=5, grad_init=5,
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# arccosh
check_sparse_mathematical_core("arccosh", stype,
lambda x: mx.sym.sparse.arccosh(x),
lambda x: np.arccosh(x),
lambda x: 1./(x**2 - 1.)**(1./2.),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# log10
check_sparse_mathematical_core("log10", stype,
lambda x: mx.sym.sparse.log10(x),
lambda x: np.log10(x),
lambda x: 1. / (x * np.log(10.)),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
# log2
check_sparse_mathematical_core("log2", stype,
lambda x: mx.sym.sparse.log2(x),
lambda x: np.log2(x),
lambda x: 1. / (x * np.log(2.)),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap, density=density,
ograd_density=ograd_density)
try:
from scipy import special as scipy_special
# On scipy v1.0, psi([0, -1, -2, -3, ...]) = [ inf, inf, inf, inf, ...]
# On scipy v1.1, psi([0, -1, -2, -3, ...]) = [-inf, nan, nan, nan, ...]
# Map the behavior of v1.1 psi() to that of v1.0 for ints <= 0 for consistency
scipy_psi = np.vectorize(lambda x: np.inf if float(x).is_integer() and x <= 0 else
scipy_special.psi(x))
# gamma
check_sparse_mathematical_core("gamma", stype,
lambda x: mx.sym.sparse.gamma(x),
lambda x: scipy_special.gamma(x),
lambda x: scipy_special.gamma(x) * scipy_psi(x),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# gammaln
check_sparse_mathematical_core("gammaln", stype,
lambda x: mx.sym.sparse.gammaln(x),
lambda x: scipy_special.gammaln(x),
lambda x: scipy_psi(x),
output_grad_stype=output_grad_stype,
input_grad_stype=input_grad_stype,
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
except ImportError:
print("Could not import scipy. Skipping unit tests for special functions")
for i in range(1):
print("pass", i)
for density in [0.0, random.uniform(0, 1), 1.0]:
for ograd_density in [0.0, random.uniform(0, 1), 1.0]:
for force_overlap in [False, True]:
print("{}, {}, {}".format(density, ograd_density, force_overlap))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Check unary ops (unary fwd, binary bwd)
check_mathematical_core('default', force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
check_mathematical_core('row_sparse', force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
check_mathematical_core('row_sparse', output_grad_stype='default',
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
check_mathematical_core('row_sparse', output_grad_stype='row_sparse',
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
check_mathematical_core('csr', output_grad_stype='default',
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
check_mathematical_core('csr', output_grad_stype='csr',
force_overlap=force_overlap,
density=density, ograd_density=ograd_density)
# Check binary with scalar ops
check_binary_op_with_scalar('default',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('row_sparse',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('row_sparse', output_grad_stype='default',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('row_sparse',
output_grad_stype='row_sparse',
density=density, ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('csr',
output_grad_stype='csr',
input_grad_stype='default',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('csr',
output_grad_stype='csr',
input_grad_stype='csr',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
check_binary_op_with_scalar('csr',
output_grad_stype='default',
density=density,
ograd_density=ograd_density,
force_overlap=force_overlap)
@pytest.mark.serial
def test_elemwise_add_ex():
def check_elemwise_add_ex(lhs_stype, rhs_stype, shape, lhs_grad_stype=None, rhs_grad_stype=None):
lhs = mx.symbol.Variable('lhs', stype=lhs_stype)
rhs = mx.symbol.Variable('rhs', stype=rhs_stype)
lhs_nd = rand_ndarray(shape, lhs_stype)
rhs_nd = rand_ndarray(shape, rhs_stype)
lhs_np = lhs_nd.asnumpy()
rhs_np = rhs_nd.asnumpy()
out_np = lhs_np + rhs_np
test = mx.symbol.sparse.elemwise_add(lhs, rhs)
location = {'lhs': lhs_nd, 'rhs': rhs_nd}
check_symbolic_forward(test, location, [out_np])
check_numeric_gradient(test, location)
grad_stypes = {}
if lhs_grad_stype is not None and lhs_grad_stype != 'default':
grad_stypes['lhs'] = lhs_grad_stype
if rhs_grad_stype is not None and rhs_grad_stype != 'default':
grad_stypes['rhs'] = rhs_grad_stype
check_symbolic_backward(test, location, [out_np], [out_np, out_np],
grad_stypes=grad_stypes)
shapes = [rand_shape_2d(), rand_shape_3d()]
for shape in shapes:
check_elemwise_add_ex('default', 'default', shape)
check_elemwise_add_ex('row_sparse', 'row_sparse', shape,
lhs_grad_stype='row_sparse', rhs_grad_stype='row_sparse')
@pytest.mark.serial
def test_cast_storage_ex():
def check_cast_storage(shape, density, from_stype, to_stype, check_numeric_grad=True):
x = mx.symbol.Variable('x', stype=from_stype)
x_nd = rand_ndarray(shape, from_stype, density=density)
x_np = x_nd.asnumpy()
out_np = x_np
test = mx.symbol.cast_storage(x, stype=to_stype)
location = {'x': x_nd}
check_symbolic_forward(test, location, [out_np])
# consider disabling the numeric grad check for the gpu block kernel since the input is large
if check_numeric_grad:
check_numeric_gradient(test, location)
grad_stypes = {'x': to_stype}
check_symbolic_backward(test, location, [out_np], [out_np], grad_stypes=grad_stypes)
density = [1.00, 0.50, 0.01]
for d in density:
shape_2d = rand_shape_2d()
shape_3d = rand_shape_3d()
check_cast_storage(shape_2d, d, 'csr', 'default')
check_cast_storage(shape_2d, d, 'default', 'csr')
check_cast_storage(shape_2d, d, 'csr', 'csr')
check_cast_storage(shape_2d, d, 'row_sparse', 'default')
check_cast_storage(shape_2d, d, 'default', 'row_sparse')
check_cast_storage(shape_2d, d, 'row_sparse', 'row_sparse')
check_cast_storage(shape_3d, d, 'row_sparse', 'default')
check_cast_storage(shape_3d, d, 'default', 'row_sparse')
check_cast_storage(shape_3d, d, 'row_sparse', 'row_sparse')
for i in range(4, 6):
shape = rand_shape_nd(i, 5)
check_cast_storage(shape, d, 'default', 'row_sparse')
check_cast_storage(shape, d, 'row_sparse', 'default')
# Test specific gpu kernels
if default_device().device_type == 'gpu':
dim0 = rnd.randint(1, 10)
# test gpu thread kernel
check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'csr')
# test gpu warp kernel
check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'csr')
# test gpu block kernel
check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'csr',
check_numeric_grad=False)
# check race condition in block kernel
check_cast_storage((200, 128 * 2 + 1), d, 'default', 'csr',
check_numeric_grad=False)
# test gpu thread kernel
check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'row_sparse')
# test gpu warp kernel
check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'row_sparse')
# test gpu block kernel
check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'row_sparse',
check_numeric_grad=False)
@pytest.mark.serial
def test_sparse_dot():
def test_infer_forward_stype(lhs_shape, rhs_shape, lhs_density, rhs_density, trans_a, trans_b):
all_stypes = ["default", "csr", "row_sparse"]
lhs_nd = rand_ndarray(lhs_shape, 'default', density=lhs_density)
rhs_nd = rand_ndarray(rhs_shape, 'default', density=rhs_density)
out_nd = mx.nd.dot(lhs_nd, rhs_nd, transpose_a=trans_a, transpose_b=trans_b)
out_np = out_nd.asnumpy()
for lhs_stype in all_stypes:
for rhs_stype in all_stypes:
for forward_stype in all_stypes:
lhs = lhs_nd.tostype(lhs_stype)
rhs = rhs_nd.tostype(rhs_stype)
out = mx.nd.dot(lhs, rhs, forward_stype=forward_stype,
transpose_a=trans_a, transpose_b=trans_b)
assert_almost_equal(out.tostype('default').asnumpy(), out_np, rtol=1e-3, atol=1e-4)
lhs_var = mx.symbol.Variable('lhs', stype=lhs_stype)
rhs_var = mx.symbol.Variable('rhs', stype=rhs_stype)
out = mx.symbol.sparse.dot(lhs_var, rhs_var,
forward_stype=forward_stype,
transpose_a=trans_a, transpose_b=trans_b)
location = {'lhs': lhs, 'rhs': rhs}
check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4)
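# test_dot_csr: csr lhs dot dense/row_sparse rhs, checked against the dense
# dot product in both imperative and symbolic mode; the symbolic backward is
# verified only for the rhs gradient (grad_req for lhs is 'null').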
def test_dot_csr(lhs_shape, rhs_shape, rhs_stype, trans_lhs, lhs_density, rhs_density):
lhs_nd = rand_ndarray(lhs_shape, 'csr', density=lhs_density, shuffle_csr_indices=False)
lhs_dns = lhs_nd.tostype('default')
rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_density)
rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default')
out = mx.nd.dot(lhs_nd, rhs_nd, transpose_a=trans_lhs)
out_dns = mx.nd.dot(lhs_dns, rhs_dns, transpose_a=trans_lhs)
out_np = out_dns.asnumpy()
assert_almost_equal(out.asnumpy(), out_np, rtol=1e-3, atol=1e-5)
# test symbolic forward
lhs = mx.symbol.Variable('lhs', stype='csr')
rhs = mx.symbol.Variable('rhs', stype=rhs_stype)
out = mx.symbol.sparse.dot(lhs, rhs, transpose_a=trans_lhs)
location = {'lhs': lhs_nd, 'rhs': rhs_nd}
check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4)
# test symbolic backward
backward_trans = not trans_lhs
rhs_backward_grad = mx.nd.dot(lhs_dns, out_dns, transpose_a=backward_trans).asnumpy()
expected = {'rhs': rhs_backward_grad}
check_symbolic_backward(out, location, [out_np], expected,
grad_req={'lhs': 'null', 'rhs': 'write'},
rtol=1e-3, atol=1e-4)
def test_dot_dns_csr(lhs_shape, rhs_shape, lhs_density, rhs_density, trans_lhs=False, trans_rhs=False):
lhs_nd = rand_ndarray(lhs_shape, stype='default', density=lhs_density)
rhs_nd = rand_ndarray(rhs_shape, stype='csr', density=rhs_density)
rhs_dns = rhs_nd.tostype('default')
if default_device() == mx.cpu():
forward_stype = 'csr'
else:
forward_stype = 'default'
out = mx.nd.sparse.dot(lhs_nd, rhs_nd, transpose_a=trans_lhs, transpose_b=trans_rhs, forward_stype=forward_stype)
out_dns = mx.nd.dot(lhs_nd, rhs_dns, transpose_a=trans_lhs, transpose_b=trans_rhs, forward_stype=forward_stype)
out_np = out_dns.asnumpy()
assert_almost_equal(out.asnumpy(), out_np, rtol=1e-3, atol=1e-5)
# test symbolic forward
lhs = mx.symbol.Variable('lhs', stype='default')
rhs = mx.symbol.Variable('rhs', stype='csr')
out = mx.symbol.sparse.dot(lhs, rhs, transpose_a=trans_lhs, transpose_b=trans_rhs, forward_stype=forward_stype)
location = {'lhs': lhs_nd, 'rhs': rhs_nd}
check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4)
if default_device() == mx.cpu():
# test symbolic backward
backward_trans = not trans_lhs
rhs_backward_grad = mx.nd.dot(lhs_nd, out_dns, transpose_a=backward_trans).asnumpy()
if trans_rhs is True:
rhs_backward_grad = rhs_backward_grad.T
expected = {'rhs': rhs_backward_grad}
check_symbolic_backward(out, location, [out_np], expected,
grad_req={'lhs': 'null', 'rhs': 'write'},
rtol=1e-3, atol=1e-4)
else:
transpose_b = not trans_rhs
lhs_backward_grad = mx.nd.dot(out_dns, rhs_dns, transpose_b=transpose_b)
expected = {'lhs': lhs_backward_grad.asnumpy()}
check_symbolic_backward(out, location, [out_np], expected,
grad_req={'lhs': 'write', 'rhs': 'null'},
rtol=1e-3, atol=1e-4)
def test_sparse_dot_zero_output(lhs_shape, trans_lhs, rhs_num_cols):
"""Test for nnr_out = 0. Before the fix, the test would fail."""
lhs = mx.nd.zeros(lhs_shape)
irow = np.random.randint(0, lhs_shape[0])
icol = np.random.randint(0, lhs_shape[1])
lhs[irow, icol] = 1.0
if trans_lhs:
rhs = rand_ndarray(shape=(lhs_shape[0], rhs_num_cols), stype='default')
rhs[irow, :] = 0
else:
rhs = rand_ndarray(shape=(lhs_shape[1], rhs_num_cols), stype='default')
rhs[icol, :] = 0
dns_out = mx.nd.dot(lhs, rhs, transpose_a=trans_lhs)
assert mx.nd.sum(mx.nd.abs(dns_out)).asscalar() == 0
sps_out = mx.nd.sparse.dot(lhs.tostype('csr'), rhs.tostype('row_sparse'), transpose_a=trans_lhs)
assert same(dns_out.asnumpy(), sps_out.asnumpy())
density = [1.00, 0.5, 0.01]
for lhs_d in density:
lhs_shape = rand_shape_2d(50, 200)
rhs_d = 1
test_dot_csr(lhs_shape, (lhs_shape[1], 1), 'default', False, lhs_d, rhs_d) # test gpu SpMV
test_dot_csr(lhs_shape, (lhs_shape[0], 1), 'default', True, lhs_d, rhs_d) # (vector kernel)
test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d) # test gpu SpMM
test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True, lhs_d, rhs_d) # (scalar kernel)
test_dot_dns_csr(lhs_shape, (lhs_shape[1], rnd.randint(50, 200)), lhs_d, lhs_d)
test_dot_dns_csr(lhs_shape, (rnd.randint(50, 200), lhs_shape[1]), lhs_d, lhs_d, trans_rhs=True)
for rhs_d in density:
test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False, lhs_d, rhs_d)
test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True, lhs_d, rhs_d)
test_infer_forward_stype(lhs_shape, (lhs_shape[1], rnd.randint(10, 20)),
lhs_d, rhs_d, False, False)
test_infer_forward_stype(lhs_shape, (rnd.randint(10, 20), lhs_shape[1]),
lhs_d, rhs_d, False, True)
test_infer_forward_stype(lhs_shape, (lhs_shape[0], rnd.randint(10, 20)),
lhs_d, rhs_d, True, False)
test_infer_forward_stype(lhs_shape, (rnd.randint(10, 20), lhs_shape[0]),
lhs_d, rhs_d, True, True)
test_sparse_dot_zero_output(rand_shape_2d(50, 200), False, 40)
test_sparse_dot_zero_output(rand_shape_2d(50, 200), True, 40)
@pytest.mark.serial
def test_sparse_dot_determinism():
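# Runs the same sparse dot twice with identical inputs and requires bitwise
# identical outputs (rtol=0, atol=0) to catch non-deterministic kernels.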
def check_dot_determinism(lhs_stype, rhs_stype, lhs_density, rhs_density, transpose_a, transpose_b, forward_stype):
lhs_row = rnd.randint(50, 100)
lhs_col = rnd.randint(50, 100)
if transpose_a:
if transpose_b:
rhs_shape = (rnd.randint(50, 100), lhs_row)
else:
rhs_shape = (lhs_row, rnd.randint(50, 100))
else:
if transpose_b:
rhs_shape = (rnd.randint(50, 100), lhs_col)
else:
rhs_shape = (lhs_col, rnd.randint(50, 100))
lhs_shape = (lhs_row, lhs_col)
lhs = rand_ndarray(lhs_shape, lhs_stype, density=lhs_density)
rhs = rand_ndarray(rhs_shape, rhs_stype, density=rhs_density)
res1 = mx.nd.sparse.dot(lhs, rhs, transpose_a=transpose_a, transpose_b=transpose_b, forward_stype=forward_stype)
res2 = mx.nd.sparse.dot(lhs, rhs, transpose_a=transpose_a, transpose_b=transpose_b, forward_stype=forward_stype)
assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.0, atol=0.0)
check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'row_sparse')
forward_stype = 'csr' if default_device() == mx.cpu() else 'default'
check_dot_determinism('default', 'csr', 1.0, 0.1, False, False, forward_stype)
check_dot_determinism('default', 'csr', 1.0, 0.1, False, True, forward_stype)
check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'default')
def test_sparse_slice():
def check_csr_slice(shape, slice_input):
storage_type = 'csr'
B, _ = rand_sparse_ndarray(shape, storage_type)
b_np = B.asnumpy()
begin = rnd.randint(0, B.shape[0] - 1)
end = rnd.randint(begin + 1, B.shape[0])
nd_slice = mx.nd.crop(B, begin=begin, end=end)
assert same(nd_slice.asnumpy(), b_np[begin:end]), (nd_slice.asnumpy(), b_np[begin:end])
shape = (rnd.randint(7, 15), rnd.randint(1, 10))
check_csr_slice(shape, True)
check_csr_slice(shape, False)
@pytest.mark.serial
def test_sparse_retain():
def check_sparse_retain(shape, density, index_type=np.int64):
num_rows = shape[0]
rsp, _ = rand_sparse_ndarray(shape=shape, stype='row_sparse', density=density)
length = np.random.randint(1, num_rows + 1)
idx = random_sample(list(range(0, num_rows)), length)
idx.sort()
dns = rsp.asnumpy()
tensor_retained_expected = np.zeros(shape)
for i in idx:
tensor_retained_expected[i][:] = dns[i]
indices = mx.nd.array(idx, dtype=index_type)
rsp_retained = mx.nd.sparse.retain(rsp, indices=indices)
assert same(tensor_retained_expected, rsp_retained.asnumpy())
# check numeric gradient
data = mx.symbol.Variable('data')
idx = mx.symbol.Variable('indices')
sym = mx.sym.sparse.retain(data=data, indices=idx)
check_numeric_gradient(sym, [rsp, indices], grad_nodes=['data'],
grad_stype_dict={'data': 'row_sparse'})
shape = rand_shape_2d()
shape_3d = rand_shape_3d()
densities = [0.01, 0.5, 1.0]
index_types = [np.float32, np.int32, np.int64]
for density in densities:
for itype in index_types:
check_sparse_retain(shape, density, itype)
check_sparse_retain(shape_3d, density, itype)
def test_sparse_unary_with_numerics():
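# check_sparse_simple: forward/backward check for a unary op; when
# backward_is_use_output is True the gradient is computed from the op's output
# (as for relu and sigmoid below), so the expected gradient stype follows the
# forward output stype.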
def check_sparse_simple(name, stype, mxnet_func, forward_numpy_call,
backward_numpy_call, output_grad_stype=None,
backward_is_use_output=False):
if output_grad_stype is None:
output_grad_stype = stype
expected_result_type, expected_grad_result_type = \
get_fw_bw_result_types_2(forward_numpy_call, stype, backward_numpy_call, output_grad_stype)
if backward_is_use_output is True:
expected_grad_result_type = expected_result_type
shape = (3, 4)
data = mx.symbol.Variable("data")
grad_stypes = {'data' : expected_grad_result_type}
y = mxnet_func(data)
if stype == 'default':
xa = np.random.uniform(low=-1.0, high=1.0, size=shape)
xa_np = xa
else:
xa = create_sparse_array(shape, stype, data_init=None, rsp_indices=[1],
modifier_func=lambda a: a - 0.5,
shuffle_csr_indices=True)
xa_np = xa.asnumpy()
if output_grad_stype != 'default':
out_grad = create_sparse_array(shape, output_grad_stype, data_init=None,
rsp_indices=[1, 2],
modifier_func=lambda a: a - 0.5,
shuffle_csr_indices=True)
out_grad_np = out_grad.asnumpy()
else:
out_grad_np = np.ones(xa.shape)
out_grad = mx.nd.array(out_grad_np)
output_np = forward_numpy_call(xa_np)
input_grad_np = backward_numpy_call(output_np, out_grad_np)
outputs = check_symbolic_forward(y, [xa], [output_np])
output = outputs[0]
assert output.stype == expected_result_type
input_grad_dict = check_symbolic_backward(y, location=[xa], out_grads=[out_grad],
expected=[input_grad_np],
grad_stypes=grad_stypes)
inp_grad = input_grad_dict["data"]
assert inp_grad.stype == expected_grad_result_type
def check_sparse_function(name, mxnet_func, forward_numpy_call, backward_numpy_call,
backward_is_use_output=False):
check_sparse_simple(name, 'default', mxnet_func, forward_numpy_call, backward_numpy_call)
for output_grad_stype in [None, "row_sparse", "default"]:
check_sparse_simple(name, 'row_sparse', mxnet_func, forward_numpy_call, backward_numpy_call,
output_grad_stype=output_grad_stype,
backward_is_use_output=backward_is_use_output)
for output_grad_stype in [None, "csr", "default"]:
check_sparse_simple(name, 'csr', mxnet_func, forward_numpy_call, backward_numpy_call,
output_grad_stype=output_grad_stype,
backward_is_use_output=backward_is_use_output)
check_sparse_function('relu',
lambda x: mx.sym.relu(x),
lambda x: np.maximum(x, 0.0),
lambda output, outg: outg * assign_each(output, lambda x: x > 0.0), backward_is_use_output=True)
check_sparse_function('sigmoid',
lambda x: mx.sym.sigmoid(x),
lambda x: np.divide(1.0, (1.0 + np.exp(-x))),
lambda output, outg: outg * assign_each(output, lambda x: x * (1.0 - x)),
backward_is_use_output=True)
@pytest.mark.serial
def test_sparse_nd_zeros():
def check_sparse_nd_zeros(stype, shape):
zero = mx.nd.zeros(shape)
sparse_zero = mx.nd.zeros(shape=shape, stype=stype)
assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy())
shape = rand_shape_2d()
check_sparse_nd_zeros('row_sparse', shape)
check_sparse_nd_zeros('csr', shape)
check_sparse_nd_zeros('default', shape)
@pytest.mark.serial
def test_sparse_nd_zeros_like():
def check_sparse_nd_zeros_like(stype, shape):
zero = mx.nd.zeros(shape, stype=stype)
zero_like = mx.nd.sparse.zeros_like(zero)
assert_almost_equal(zero.asnumpy(), zero_like.asnumpy())
shape = rand_shape_2d()
check_sparse_nd_zeros_like('row_sparse', shape)
check_sparse_nd_zeros_like('csr', shape)
@pytest.mark.serial
def test_sparse_axis_operations():
def test_variations(func_name):
dim0 = 30
dim1 = 100
axes = [0, 1]
densities = [0, 0.5, 1]
for density in densities:
shape = rand_shape_2d(dim0, dim1)
csr_array = rand_ndarray(shape=shape, stype='csr', density=density)
dns = csr_array.tostype('default')
for axis in axes:
ret = func_name(csr_array, axis=axis)
assert ret.stype == 'default'
ret_expected = func_name(dns, axis=axis)
assert_almost_equal(ret.asnumpy(), ret_expected.asnumpy())
def test_fallback(func_name, axis=0, keepdims=True, exclude=True):
dim0 = 30
dim1 = 100
shape = rand_shape_2d(dim0, dim1)
csr_array = rand_ndarray(shape=shape, stype='csr', density=0.01)
ret = func_name(csr_array, axis=axis, keepdims=keepdims,
exclude=exclude)
test_variations(mx.nd.sum)
test_fallback(mx.nd.sum, axis=0, keepdims=True, exclude=True)
test_variations(mx.nd.mean)
test_fallback(mx.nd.mean, axis=0, keepdims=True, exclude=True)
@pytest.mark.serial
def test_sparse_square_sum():
dim0 = 30
dim1 = 30
axes = [0, 1]
keepdims = [False, True]
densities = [0, 0.01, 0.2, 0.5, 1.0]
for density in densities:
shape = rand_shape_2d(dim0, dim1)
rsp = rand_ndarray(shape, 'row_sparse', density)
dns = rsp.tostype('default')
for axis in axes:
for keepdim in keepdims:
ret = mx.nd._internal._square_sum(rsp, axis=axis, keepdims=keepdim)
if axis == 1 and keepdim:
assert ret.stype == 'row_sparse'
else:
assert ret.stype == 'default'
ret_expected = mx.nd.sum(dns*dns, axis=axis, keepdims=keepdim)
# check forward result
assert_almost_equal(ret.asnumpy(), ret_expected.asnumpy())
rsp_data = mx.sym.Variable('data', stype='row_sparse')
test = mx.symbol._internal._square_sum(rsp_data, axis=axis, keepdims=keepdim)
# check symbolic backward since ograd can be an rsp
# and cannot be checked through check_numeric_gradient
# because it will add a loss layer as the output layer
# which makes ograd of the square_sum dense
if axis == 1 and keepdim:
dns_data = mx.sym.Variable('data')
baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim)
igrad_expected = mx.nd.empty(dns.shape)
baseline_exec = baseline._bind(default_device(), args=[dns],
args_grad=[igrad_expected])
baseline_exec.forward(is_train=True)
baseline_exec.backward([ret_expected])
# check backward when ograd is row sparse
check_symbolic_backward(test, [rsp], [ret_expected.tostype('row_sparse')],
[igrad_expected.asnumpy()], grad_stypes={'data': 'row_sparse'})
# check backward when ograd is dense
# the stype of the output of square_sum is determined at symbol binding stage.
# The ograd stype of the last layer is the same as the output stype of the last layer.
# Need to add one more layer after square_sum to trigger the kernel for ograd
# with default stype in square_sum op.
baseline1 = baseline + 1
baseline_exec1 = baseline1._bind(default_device(), args=[dns],
args_grad=[igrad_expected])
baseline_exec1.forward(is_train=True)
baseline_exec1.backward([ret_expected])
test1 = test + 1
check_symbolic_backward(test1, [rsp], [ret_expected], [igrad_expected.asnumpy()],
grad_stypes={'data': 'row_sparse'})
# check numeric gradient
check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'},
atol=1e-2, rtol=0.1)
@pytest.mark.serial
@pytest.mark.flaky
def test_sparse_storage_fallback():
""" test operators which don't implement FComputeEx or FStatefulComputeEx """
def check_broadcast_add(shape, lhs_stype, rhs_stype):
lhs = mx.symbol.Variable('lhs', stype=lhs_stype)
rhs = mx.symbol.Variable('rhs', stype=rhs_stype)
lhs_nd = rand_ndarray(shape, lhs_stype)
rhs_nd = rand_ndarray(shape, rhs_stype)
lhs_dns = mx.nd.cast_storage(lhs_nd, stype='default')
rhs_dns = mx.nd.cast_storage(rhs_nd, stype='default')
out_dns = (lhs_dns + rhs_dns).asnumpy()
test = mx.symbol.broadcast_add(lhs, rhs)
location = {'lhs': lhs_nd, 'rhs': rhs_nd}
check_symbolic_forward(test, location, [out_dns])
check_numeric_gradient(test, location)
check_symbolic_backward(test, location, [out_dns], [out_dns, out_dns])
def np_softmax(x, axis=-1):
# fix for old numpy on Travis not supporting keepdims
x = x - np.max(x, axis=axis, keepdims=True)
x = np.exp(x)
x /= np.sum(x, axis=axis, keepdims=True)
return x
def check_concat(shape, lhs_stype, rhs_stype):
x = mx.symbol.Variable('x', stype=lhs_stype)
w = mx.symbol.Variable('w', stype=rhs_stype)
test = mx.sym.Concat(x, w)
x_nd = rand_ndarray(shape, lhs_stype)
w_nd = rand_ndarray(shape, rhs_stype)
location = {'x': x_nd, 'w': w_nd}
check_numeric_gradient(test, location)
def check_operator_with_temp_resource(shape, stype):
x = mx.symbol.Variable('x', stype=stype)
test = mx.sym.sum(x)
x_nd = rand_ndarray(shape, stype)
location = {'x': x_nd}
check_numeric_gradient(test, location)
shape = rand_shape_2d()
stypes = ['default', 'csr', 'row_sparse']
for lhs in stypes:
check_operator_with_temp_resource(shape, lhs)
for rhs in stypes:
check_broadcast_add(shape, lhs, rhs)
check_concat(shape, lhs, rhs)
@pytest.mark.serial
def test_sparse_elementwise_sum():
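# add_n over a mix of storage types: the forward result is checked against the
# numpy sum of the inputs, and every input gradient should equal the output grad.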
def check_sparse_elementwise_sum_with_shape(stypes, shape, n):
# forward
inputs = [mx.symbol.Variable(f'arg{i}') for i in range(n)]
out = mx.symbol.sparse.add_n(*inputs, name='esum')
arr = []
arr_grad = [mx.nd.empty(shape, stype=stype) for stype in stypes]
densities = [0, 0.01, 0.5, 1.0]
for stype in stypes:
arr.append(rand_ndarray(shape, stype, densities[np.random.randint(0, len(densities))]))
exec1 = out._bind(default_device(),
args=arr,
args_grad=arr_grad)
exec1.forward(is_train=True)
out1 = exec1.outputs[0].asnumpy()
out = sum(a.asnumpy() for a in arr)
assert_almost_equal(out, out1, atol=1e-5)
out_grad = mx.nd.empty(shape)
out_grad[:] = np.random.uniform(-10, 10, shape)
# backward
exec1.backward([out_grad])
for a in arr_grad:
assert_almost_equal(a.asnumpy(), out_grad.asnumpy(), atol=1e-5)
all_stypes = ['default', 'csr', 'row_sparse']
for dim in range(2, 4):
shape = tuple(np.random.randint(5, 10, size=dim))
rsp_test_cnt = np.random.randint(1, 9)
check_sparse_elementwise_sum_with_shape(['row_sparse' for i in range(rsp_test_cnt)], shape, rsp_test_cnt)
if dim == 2:
check_sparse_elementwise_sum_with_shape(['default', 'csr', 'default'], shape, 3)
test_len = np.random.randint(5, 10)
# at least one default type
stypes = ['default']
for _ in range(test_len):
pick_side = np.random.randint(2)
pick_type = np.random.randint(3)
stypes = ([all_stypes[pick_type]] if pick_side == 0 else []) + stypes + ([all_stypes[pick_type]] if pick_side == 1 else [])
check_sparse_elementwise_sum_with_shape(stypes, shape, test_len+1)
@pytest.mark.serial
def test_sparse_embedding():
''' test sparse embedding operator '''
def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad):
target_stype = 'row_sparse' if sparse_grad else 'default'
# init executor
data = mx.sym.Variable("data")
weight = mx.sym.Variable("embed_weight")
embed = mx.sym.sparse.Embedding(data=data, weight=weight, input_dim=in_dim,
sparse_grad=sparse_grad, output_dim=out_dim, name='embed')
grad_req = {'data': 'null', 'embed_weight': 'write'}
args = {'embed_weight': mx.nd.zeros((in_dim, out_dim)), 'data': mx.nd.ones((batch,))}
weight_grad = mx.nd.zeros((in_dim, out_dim))
if sparse_grad:
weight_grad = weight_grad.tostype('row_sparse')
args_grad = {'embed_weight': weight_grad}
exe_test = embed._bind(default_device(), args=args, args_grad=args_grad, grad_req=grad_req)
arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays))
grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays))
# init data
np_data = np.random.randint(low=0, high=in_dim, size=batch)
np_onehot = np.zeros((batch, in_dim)).astype(np.float32)
np_onehot[np.arange(batch), np_data] = 1.0
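# dense one-hot encoding of the indices: the embedding forward should equal
# np.dot(np_onehot, weight) and grad(weight) should equal np.dot(np_onehot.T, grad)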
arg_map["data"][:] = np_data
# weight
weight = arg_map["embed_weight"]
for density in densities:
# update weight based on density
weight[:] = rand_ndarray(weight.shape, 'default', density=density)
# check forward
exe_test.forward(is_train=True)
# init grad
np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape)
grad = mx.nd.zeros(np_grad.shape)
grad[:] = np_grad
assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, weight.asnumpy()), atol=1e-4)
# check backward
exe_test.backward([grad])
assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, grad.asnumpy()), atol=1e-4)
# check grad stype
assert(grad_map["embed_weight"].stype == target_stype)
densities = [0, 0.5, 1]
in_dim = 50
out_dim = 3
batch = 8
sparse_grads = [True, False]
for sparse_grad in sparse_grads:
check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad)
def test_sparse_broadcast_add_sub():
def check_broadcast_add(mx_lhs, mx_rhs, np_lhs, np_rhs, dtype):
assert_almost_equal(mx.nd.sparse.add(mx_lhs, mx_rhs).asnumpy(), np.add(np_lhs, np_rhs), atol=1e-4)
def check_broadcast_sub(mx_lhs, mx_rhs, np_lhs, np_rhs, dtype):
assert_almost_equal(mx.nd.sparse.subtract(mx_lhs, mx_rhs).asnumpy(), np.subtract(np_lhs, np_rhs), atol=1e-4)
stype = 'csr'
shape = rand_shape_2d()
num_rows = shape[0]
num_cols = shape[1]
for density in [0.1 * i for i in range(10)]:
mx_lhs = rand_ndarray(shape, stype, density)
np_lhs = mx_lhs.asnumpy()
mx_rhs_row_2D = rand_ndarray((1, num_cols), 'default')
mx_rhs_row_1D = mx_rhs_row_2D.reshape((num_cols,))
mx_rhs_col = rand_ndarray((num_rows, 1), 'default')
mx_rhs_scalar_2D = rand_ndarray((1, 1), 'default')
mx_rhs_scalar_1D = mx_rhs_scalar_2D.reshape((1, ))
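# broadcast the csr lhs against dense row, column and scalar operands, in both operand orders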
for mx_rhs in [mx_rhs_row_2D, mx_rhs_row_1D, mx_rhs_col, mx_rhs_scalar_2D, mx_rhs_scalar_1D]:
np_rhs = mx_rhs.asnumpy()
check_broadcast_add(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32)
check_broadcast_sub(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32)
check_broadcast_add(mx_rhs, mx_lhs, np_rhs, np_lhs, np.float32)
check_broadcast_sub(mx_rhs, mx_lhs, np_rhs, np_lhs, np.float32)
def test_sparse_broadcast_mul_div():
def check_broadcast_mul(mx_lhs, mx_rhs, np_lhs, np_rhs, dtype):
assert_almost_equal(mx.nd.sparse.multiply(mx_lhs, mx_rhs).asnumpy(), np.multiply(np_lhs, np_rhs), atol=1e-4)
def check_broadcast_div(mx_lhs, mx_rhs, np_lhs, np_rhs, dtype):
assert_almost_equal(mx.nd.sparse.divide(mx_lhs, mx_rhs).asnumpy(), np.divide(np_lhs, np_rhs), atol=1e-4)
stype = 'csr'
shape = rand_shape_2d()
num_rows = shape[0]
num_cols = shape[1]
for density in [0.1 * i for i in range(10)]:
mx_lhs = rand_ndarray(shape, stype, density)
np_lhs = mx_lhs.asnumpy()
mx_rhs_row_2D = rand_ndarray((1, num_cols), 'default')
mx_rhs_row_1D = mx_rhs_row_2D.reshape((num_cols,))
mx_rhs_col = rand_ndarray((num_rows, 1), 'default')
mx_rhs_scalar_2D = rand_ndarray((1, 1), 'default')
mx_rhs_scalar_1D = mx_rhs_scalar_2D.reshape((1, ))
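# broadcast the csr lhs against dense row, column and scalar operands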
for mx_rhs in [mx_rhs_row_2D, mx_rhs_row_1D, mx_rhs_col, mx_rhs_scalar_2D, mx_rhs_scalar_1D]:
np_rhs = mx_rhs.asnumpy()
check_broadcast_mul(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32)
check_broadcast_div(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32)
def test_batchnorm_fallback():
# same test as test_operator.test_batchnorm_training, but tests fallback logic of batchnorm
stype = 'row_sparse'
for shape in [(2, 3), (2, 3, 2, 2)]:
data_tmp = np.random.normal(-0.1, 0.1, size=shape)
s = (shape[1],)
gamma = np.ones(s)
beta = np.ones(s)
gamma[1] = 3
beta[0] = 3
rolling_mean = np.random.uniform(size=s)
rolling_std = np.random.uniform(size=s)
data = mx.symbol.Variable('data', stype=stype)
in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype),
mx.nd.array(beta).tostype(stype)]
mean_std = [mx.nd.array(rolling_mean).tostype(stype), mx.nd.array(rolling_std).tostype(stype)]
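# with fix_gamma=True the row_sparse fallback is expected to raise MXNetError;
# fix_gamma=False should pass the numeric gradient check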
test = mx.symbol.BatchNorm(data, fix_gamma=True)
assertRaises(MXNetError, check_numeric_gradient, test, in_location, mean_std, numeric_eps=1e-3, rtol=0.16, atol=1e-2)
test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True)
assertRaises(MXNetError, check_numeric_gradient, test, in_location, mean_std, numeric_eps=1e-3, rtol=0.16, atol=1e-2)
test = mx.symbol.BatchNorm(data, fix_gamma=False)
check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-3, rtol=0.16, atol=1e-2)
test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True)
check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-3, rtol=0.16, atol=1e-2)
# Test varying channel axis
dim = len(shape)
for chaxis in range(-dim, dim):
chaxis_true = chaxis
if chaxis < 0:
chaxis_true = dim + chaxis
shapex = shape
channel_count = shapex[chaxis_true]
data_tmp = np.random.normal(-0.1, 0.1, size=shapex)
gamma = np.ones(channel_count)
beta = np.ones(channel_count)
if channel_count > 1:
gamma[1] = 3
beta[0] = 3
in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype),
mx.nd.array(beta).tostype(stype)]
xrolling_mean = np.random.uniform(size=channel_count)
xrolling_std = np.random.uniform(size=channel_count)
xmean_std = [mx.nd.array(xrolling_mean).tostype(stype),
mx.nd.array(xrolling_std).tostype(stype)]
test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis)
assertRaises(MXNetError, check_numeric_gradient, test, in_location, xmean_std, numeric_eps=1e-3, rtol=0.2, atol=0.01)
test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis)
assertRaises(MXNetError, check_numeric_gradient, test, in_location, xmean_std, numeric_eps=1e-3, rtol=0.2, atol=0.01)
test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis)
check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-3, rtol=0.2, atol=0.01)
test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis)
check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-3, rtol=0.2, atol=0.01)
@pytest.mark.serial
def test_dnnl_sparse():
# This test is trying to create a race condition described in
# https://github.com/apache/incubator-mxnet/issues/10189
arr = mx.nd.random.uniform(shape=(10, 10, 32, 32))
weight1 = mx.nd.random.uniform(shape=(10, 10, 3, 3))
arr = mx.nd.Convolution(data=arr, weight=weight1, no_bias=True, kernel=(3, 3), num_filter=10)
rs_arr = mx.nd.sparse.row_sparse_array((mx.nd.zeros_like(arr), np.arange(arr.shape[0])))
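# rs_arr is an all-zero row_sparse array covering every row, so subtracting it leaves arr unchanged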
weight2 = mx.nd.random.uniform(shape=(10, np.prod(arr.shape[1:4])))
fc_res = mx.nd.FullyConnected(data=arr, weight=weight2, no_bias=True, num_hidden=10)
sub_res = mx.nd.elemwise_sub(arr, rs_arr)
res1 = np.dot(mx.nd.flatten(sub_res).asnumpy(), weight2.asnumpy().T)
assert_almost_equal(res1, fc_res.asnumpy())
@pytest.mark.serial
def test_sparse_nd_where():
def get_forward_expected_output(condition, x, y):
original_shape = x.shape
out = np.zeros(original_shape)
if condition.shape == x.shape:
for index, c in np.ndenumerate(condition):
if c != 0:
out[index] = x[index]
else:
out[index] = y[index]
else:
raise RuntimeError("Invalid condition shape for where op")
out = out.reshape(original_shape)
return out
def get_forward_inputs_same_shape(shape):
condition_np = np.random.randint(0, 2, np.prod(shape)).reshape(shape)
x_np = np.random.randint(1, 6, np.prod(shape)).reshape(shape)
y_np = np.random.randint(7, 11, np.prod(shape)).reshape(shape)
return condition_np, x_np, y_np
def get_backward_input(shape):
return np.random.randint(20, 30, np.prod(shape)).reshape(shape)
def get_backward_expected_outputs(grad_in, condition):
shape = grad_in.shape
grad_cond = np.zeros(condition.shape)
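# the condition input receives no gradient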
grad_x = np.empty(shape)
grad_y = np.empty(shape)
for index, c in np.ndenumerate(condition):
if c != 0:
grad_x[index] = grad_in[index]
grad_y[index] = 0
else:
grad_x[index] = 0
grad_y[index] = grad_in[index]
return grad_cond, grad_x, grad_y
def test_where_helper(shape):
condition_np, x_np, y_np = get_forward_inputs_same_shape(shape)
out_expected = get_forward_expected_output(condition_np, x_np, y_np)
grad_in_np = get_backward_input(shape)
grad_expected_cond, grad_expected_x, grad_expected_y \
= get_backward_expected_outputs(grad_in_np, condition_np)
condition = mx.sym.Variable('condition', stype='csr')
x = mx.sym.Variable('x')
y = mx.sym.Variable('y')
grad_in_mx = mx.nd.array(grad_in_np, dtype=np.int32)
where_sym = mx.sym.where(condition, x, y)
cond_nd = mx.nd.array(condition_np)
args = {'condition': cond_nd.tostype('csr'), 'x': mx.nd.array(x_np),
'y' : mx.nd.array(y_np)}
args_grad = {'condition': mx.nd.zeros_like(cond_nd),
'x': mx.nd.array(x_np).tostype('csr'), 'y' : mx.nd.array(y_np)}
# test req='write'
where_exe_write = where_sym._bind(ctx=default_device(), args=args,
args_grad=args_grad, grad_req='write')
# test forward req='write'
outputs = where_exe_write.forward(is_train=True)
assert same(outputs[0].asnumpy(), out_expected)
# test backward req='write'
where_exe_write.backward(grad_in_mx.astype('float32'))
assert same(where_exe_write.grad_dict['x'].asnumpy(), grad_expected_x)
assert same(where_exe_write.grad_dict['y'].asnumpy(), grad_expected_y)
assert same(where_exe_write.grad_dict['condition'].asnumpy(), grad_expected_cond)
# test req='add'
x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape)
y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape)
where_exe_add = where_sym._bind(ctx=default_device(), args=args,
args_grad=args_grad, grad_req='add')
where_exe_add.grad_dict['x'][:] = x_grad_init
where_exe_add.grad_dict['y'][:] = y_grad_init
# test forward req='add'
outputs = where_exe_add.forward(is_train=True)
assert same(outputs[0].asnumpy(), out_expected)
def test_where_numeric_gradient(shape):
condition = mx.sym.Variable('condition', stype='csr')
x = mx.sym.Variable('x')
y = mx.sym.Variable('y')
where_sym = mx.sym.where(condition, x, y)
condition_np, x_np, y_np = get_forward_inputs_same_shape(shape)
check_numeric_gradient(where_sym, [condition_np, x_np, y_np], grad_nodes=['x', 'y'])
test_where_helper((5, 9))
test_where_numeric_gradient((5, 9))
@pytest.mark.serial
def test_sparse_quadratic_function():
def f(x, a, b, c):
return a * x**2 + b * x + c
def check_sparse_quadratic_function(a, b, c, expected_stype):
# check forward and compare the result with dense op
ndim = 2
shape = rand_shape_nd(ndim, 5)
data = rand_ndarray(shape=shape, stype='csr')
data_np = data.asnumpy()
expected = f(data_np, a, b, c)
output = mx.nd.contrib.quadratic(data, a=a, b=b, c=c)
assert(output.stype == expected_stype)
assert_almost_equal(output.asnumpy(), expected)
a = np.random.random_sample()
b = np.random.random_sample()
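# a zero constant term maps zeros to zeros, so csr storage is preserved; a nonzero c densifies the output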
check_sparse_quadratic_function(a, b, 0.0, 'csr')
check_sparse_quadratic_function(a, b, 1.0, 'default')
def test_reshape_backward_fallback():
"""
  out
  |  \
 w_x  x
  /
 w
in which x is a sparse tensor and w_x = reshape(w).
Due to the sparse gradient optimization in sym.dot, grad(w_x) is sparse.
Although sym.reshape itself has no sparse implementation, grad(w) can also
be made sparse, e.g.,
- by passing a sparse args_grad entry for w to symbol.bind, or
- by adding out_y = sym.dot(sparse_y, w), so that grad(w) is inferred as sparse.
In either case, reshape's backward pass (from w_x back to w) needs to handle sparse inputs.
"""
ctx = default_device()
w_shape = (12, 4)
w_x_shape = (1, 48)
x_nd = rand_ndarray((4, 1), 'csr')
w_nd = rand_ndarray(w_shape)
w_x_nd = w_nd.reshape(w_x_shape)
out_x_nd = mx.nd.dot(x_nd, w_x_nd)
w_x_backward_grad = mx.nd.dot(x_nd, out_x_nd, transpose_a=True).asnumpy()
expected_grad_nd = w_x_backward_grad.reshape(w_shape)
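# reference gradient: grad(w_x) = dot(x.T, out), reshaped back to w's shape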
x = mx.sym.Variable('x', stype='csr')
w = mx.sym.Variable('w')
w_x = mx.sym.reshape(w, w_x_shape, name="w_x")
out = mx.sym.sparse.dot(x, w_x, name='out_x')
grad_w_nd = rand_ndarray(w_shape, 'row_sparse')
executor = out._bind(ctx=ctx, args={"x": x_nd, "w": w_nd},
args_grad={"w": grad_w_nd})
executor.forward(is_train=True)
executor.backward(out_x_nd)
assert_almost_equal(grad_w_nd.asnumpy(), expected_grad_nd)