blob: 1c5a5835e6f9f94e8e6df0cc54df5128af4a410d [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import print_function
import sys
import os
import tempfile
import time
import multiprocessing as mp
import unittest
import random
import mxnet as mx
import numpy as np
import unittest
import math
from nose.tools import assert_raises
from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal
from mxnet.base import MXNetError
from mxnet import autograd
from numpy.testing import assert_allclose
from mxnet.test_utils import rand_ndarray
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
sys.path.insert(0, os.path.join(curr_path, '../unittest'))
from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied
from common import run_in_spawned_process
from test_gluon import *
from test_loss import *
from test_gluon_rnn import *
set_default_context(mx.gpu(0))
def check_rnn_layer(layer):
    """Run `layer` on identical all-ones input on GPU and CPU and compare.

    Outputs and every returned state must agree within (rtol=1e-2, atol=1e-6).
    """
    layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)])

    def run_on(ctx):
        # Create the (deterministic) input and begin state on the device.
        with ctx:
            inputs = mx.nd.ones((10, 16, 30))
            begin = layer.begin_state(16)
            return layer(inputs, begin)

    gpu_out, gpu_states = run_on(mx.gpu(0))
    cpu_out, cpu_states = run_on(mx.cpu(0))
    # atol of 1e-6 required, as exposed by seed 2124685726
    assert_almost_equal(gpu_out.asnumpy(), cpu_out.asnumpy(),
                        rtol=1e-2, atol=1e-6)
    for gpu_state, cpu_state in zip(gpu_states, cpu_states):
        assert_almost_equal(gpu_state.asnumpy(), cpu_state.asnumpy(),
                            rtol=1e-2, atol=1e-6)
@with_seed()
def check_rnn_layer_w_rand_inputs(layer):
    """Run `layer` on one shared random input on GPU and CPU and compare.

    Outputs and every returned state must agree within (rtol=1e-2, atol=1e-6).
    """
    layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)])
    # One random sample, copied to each device so both runs see the same data.
    sample = mx.nd.uniform(shape=(10, 16, 30))

    def run_on(ctx):
        with ctx:
            inputs = sample.copyto(ctx)
            begin = layer.begin_state(16)
            return layer(inputs, begin)

    gpu_out, gpu_states = run_on(mx.gpu(0))
    cpu_out, cpu_states = run_on(mx.cpu(0))
    assert_almost_equal(gpu_out.asnumpy(), cpu_out.asnumpy(),
                        rtol=1e-2, atol=1e-6)
    for gpu_state, cpu_state in zip(gpu_states, cpu_states):
        assert_almost_equal(gpu_state.asnumpy(), cpu_state.asnumpy(),
                            rtol=1e-2, atol=1e-6)
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='7.2.1')
def test_lstmp():
    """Compare the fused (cuDNN) projected LSTM layer against LSTMPCell.

    Both implementations are loaded with identical random weights and fed the
    same input; their outputs and per-parameter gradients must agree.  Also
    smoke-tests projected-LSTM forward with bidirectional/dropout options.
    """
    hidden_size, projection_size = 3, 2
    rtol, atol = 1e-2, 1e-2
    batch_size, seq_len = 7, 11
    input_size = 5
    ctx = mx.gpu(0)
    lstm_input = mx.nd.uniform(
        shape=(seq_len, batch_size, input_size), ctx=ctx)
    # Parameter shapes shared by the fused layer and the reference cell.
    shapes = {'i2h_weight': (hidden_size * 4, input_size),
              'h2h_weight': (hidden_size * 4, projection_size),
              'i2h_bias': (hidden_size * 4,),
              'h2h_bias': (hidden_size * 4,),
              'h2r_weight': (projection_size, hidden_size)}
    weights = {k: rand_ndarray(v) for k, v in shapes.items()}
    lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size,
                                input_size=input_size, prefix='lstm0_')
    lstm_cell = gluon.contrib.rnn.LSTMPCell(hidden_size=hidden_size,
                                            projection_size=projection_size,
                                            input_size=input_size,
                                            prefix='lstm0_l0_')
    lstm_layer.initialize(ctx=ctx)
    lstm_cell.initialize(ctx=ctx)
    layer_params = lstm_layer.collect_params()
    cell_params = lstm_cell.collect_params()
    # Copy the same random weights into both implementations so they are
    # numerically comparable.
    for k, v in weights.items():
        layer_params['lstm0_l0_' + k].set_data(v.copy())
        cell_params['lstm0_l0_' + k].set_data(v.copy())
    with autograd.record():
        layer_output = lstm_layer(lstm_input.copy())
        cell_output = lstm_cell.unroll(seq_len, lstm_input.copy(), layout='TNC',
                                       merge_outputs=True)[0]
    assert_almost_equal(layer_output.asnumpy(),
                        cell_output.asnumpy(), rtol=rtol, atol=atol)
    layer_output.backward()
    cell_output.backward()
    # Gradients of every shared parameter must also agree.
    for k, v in weights.items():
        layer_grad = layer_params['lstm0_l0_' + k].grad()
        cell_grad = cell_params['lstm0_l0_' + k].grad()
        print('checking gradient for {}'.format('lstm0_l0_' + k))
        assert_almost_equal(layer_grad.asnumpy(), cell_grad.asnumpy(),
                            rtol=rtol, atol=atol)
    # Forward-only smoke tests for the projected-LSTM option combinations.
    check_rnn_layer_forward(gluon.rnn.LSTM(
        10, 2, projection_size=5), mx.nd.ones((8, 3, 20)), ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.nd.ones(
        (8, 3, 20)), [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.nd.ones((8, 3, 20)),
                            run_only=True, ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], run_only=True, ctx=ctx)
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='7.2.1')
def test_lstm_clip():
    """Projected bidirectional LSTM with state clipping enabled.

    After one recorded forward pass, the returned cell states must lie inside
    [clip_min, clip_max] and contain no NaNs.
    """
    hidden_size, projection_size = 4096, 2048
    batch_size, seq_len = 32, 80
    input_size = 50
    clip_min, clip_max, clip_nan = -5, 5, True
    ctx = mx.gpu(0)
    lstm_input = mx.nd.uniform(shape=(seq_len, batch_size, input_size),
                               ctx=ctx)
    lstm_states = [
        mx.nd.uniform(shape=(2, batch_size, projection_size), ctx=ctx),
        mx.nd.uniform(shape=(2, batch_size, hidden_size), ctx=ctx),
    ]
    lstm_layer = gluon.rnn.LSTM(hidden_size,
                                projection_size=projection_size,
                                input_size=input_size,
                                prefix='lstm0_',
                                bidirectional=True,
                                state_clip_min=clip_min,
                                state_clip_max=clip_max,
                                state_clip_nan=clip_nan)
    lstm_layer.initialize(ctx=ctx)
    with autograd.record():
        _, layer_output_states = lstm_layer(lstm_input, lstm_states)
    cell_states = layer_output_states[0].asnumpy()
    assert (cell_states >= clip_min).all() and (cell_states <= clip_max).all()
    assert not np.isnan(cell_states).any()
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnn_layer():
    """CPU/GPU consistency checks for the stock recurrent layer types."""
    layers = [
        gluon.rnn.RNN(100, num_layers=3),
        gluon.rnn.RNN(100, activation='tanh', num_layers=3),
        gluon.rnn.LSTM(100, num_layers=3),
        gluon.rnn.GRU(100, num_layers=3),
        gluon.rnn.LSTM(100, num_layers=3, bidirectional=True),
    ]
    for layer in layers:
        check_rnn_layer(layer)
    # Also exercise the random-input variant for the bidirectional LSTM.
    check_rnn_layer_w_rand_inputs(
        gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
def check_layer_bidirectional(size, in_size, proj_size):
    """Compare a fused bidirectional LSTM against a two-net reference.

    The reference runs two unidirectional LSTMs -- one over the input and one
    over the time-flipped input -- and concatenates their outputs along the
    feature axis.  Identical random weights are loaded into both nets, then
    their outputs on the same random input are compared.  A falsy `proj_size`
    means no projection weights are created.
    """
    class RefBiLSTM(gluon.Block):
        # Reference implementation: explicit forward + backward LSTM pair.
        def __init__(self, size, proj_size, **kwargs):
            super(RefBiLSTM, self).__init__(**kwargs)
            with self.name_scope():
                self._lstm_fwd = gluon.rnn.LSTM(
                    size, projection_size=proj_size, bidirectional=False, prefix='l0')
                self._lstm_bwd = gluon.rnn.LSTM(
                    size, projection_size=proj_size, bidirectional=False, prefix='r0')

        def forward(self, inpt):
            fwd = self._lstm_fwd(inpt)
            # Backward direction: flip time axis, run, flip back.
            bwd_inpt = nd.flip(inpt, 0)
            bwd = self._lstm_bwd(bwd_inpt)
            bwd = nd.flip(bwd, 0)
            return nd.concat(fwd, bwd, dim=2)

    # Random weights for both directions ('l' = forward, 'r' = backward).
    weights = {}
    for d in ['l', 'r']:
        weights['lstm_{}0_i2h_weight'.format(d)] = mx.random.uniform(
            shape=(size * 4, in_size))
        if proj_size:
            weights['lstm_{}0_h2h_weight'.format(d)] = mx.random.uniform(
                shape=(size * 4, proj_size))
            weights['lstm_{}0_h2r_weight'.format(d)] = mx.random.uniform(
                shape=(proj_size, size))
        else:
            weights['lstm_{}0_h2h_weight'.format(
                d)] = mx.random.uniform(shape=(size * 4, size))
        weights['lstm_{}0_i2h_bias'.format(
            d)] = mx.random.uniform(shape=(size * 4,))
        weights['lstm_{}0_h2h_bias'.format(
            d)] = mx.random.uniform(shape=(size * 4,))

    net = gluon.rnn.LSTM(size, projection_size=proj_size,
                         bidirectional=True, prefix='lstm_')
    ref_net = RefBiLSTM(size, proj_size, prefix='lstm_')
    net.initialize()
    ref_net.initialize()
    net_params = net.collect_params()
    ref_net_params = ref_net.collect_params()
    # The reference net nests an extra layer scope, so 'l0'/'r0' in the fused
    # net's parameter names map to 'l0l0'/'r0l0' in the reference net's.
    for k in weights:
        net_params[k].set_data(weights[k])
        ref_net_params[k.replace('l0', 'l0l0').replace(
            'r0', 'r0l0')].set_data(weights[k])

    data = mx.random.uniform(shape=(11, 10, in_size))
    assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy())
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_layer_bidirectional():
    """Bidirectional LSTM without projection matches the reference net."""
    check_layer_bidirectional(size=7, in_size=5, proj_size=0)
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='7.2.1')
def test_layer_bidirectional_proj():
    """Bidirectional LSTM with projection matches the reference net."""
    check_layer_bidirectional(size=7, in_size=5, proj_size=3)
@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
def test_rnn_layer_begin_state_type():
    """An LSTM cast to float16 must accept float16 input when it creates its
    own begin state (the implicit state must inherit the layer dtype)."""
    batch = nd.random.uniform(shape=(3, 5, 7), dtype='float16')
    lstm = gluon.rnn.LSTM(hidden_size=11, num_layers=2, dropout=0.2,
                          bidirectional=True)
    lstm.cast('float16')
    lstm.initialize()
    lstm(batch)
def test_gluon_ctc_consistency():
    """CTC loss gradients must agree between the CPU and GPU implementations."""
    loss = mx.gluon.loss.CTCLoss()
    data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0)
                        ).reshape((2, 20, 4)).flip(axis=0)
    labels = [[2, 1, -1, -1], [3, 2, 2, -1]]
    cpu_label = mx.nd.array(labels, ctx=mx.cpu(0))
    gpu_label = mx.nd.array(labels, ctx=mx.gpu(0))

    def grad_of(arr, label):
        # Record loss + backward and return the input gradient.
        arr.attach_grad()
        with mx.autograd.record():
            loss(arr, label).backward()
        return arr.grad

    cpu_grad = grad_of(data.copy().as_in_context(mx.cpu(0)), cpu_label)
    gpu_grad = grad_of(data.copyto(mx.gpu(0)), gpu_label)
    assert_almost_equal(cpu_grad.asnumpy(), gpu_grad.asnumpy(),
                        atol=1e-3, rtol=1e-3)
@with_seed()
def test_global_norm_clip_multi_device():
    """clip_global_norm over arrays on different devices (GPU + CPU).

    ones(3,3) and ones(4,4) have a combined global norm of sqrt(9+16)=5, so
    with max_norm=1.0 every element must be scaled by 1/5.
    """
    expected_norm = 5.0
    for check_isfinite in (True, False):
        grad_gpu = mx.nd.ones((3, 3), ctx=mx.gpu(0))
        grad_cpu = mx.nd.ones((4, 4), ctx=mx.cpu(0))
        total = gluon.utils.clip_global_norm(
            [grad_gpu, grad_cpu], 1.0, check_isfinite=check_isfinite)
        if check_isfinite:
            # check_isfinite=True returns a python scalar.
            assert total == expected_norm
        else:
            # check_isfinite=False keeps the norm as an NDArray.
            assert total.asscalar() == expected_norm
        assert_almost_equal(grad_gpu.asnumpy(), np.ones((3, 3)) / expected_norm)
        assert_almost_equal(grad_cpu.asnumpy(), np.ones((4, 4)) / expected_norm)
@with_seed()
def test_symbol_block_fp16():
    """Verify SymbolBlock can load a model whose params were cast to fp16.

    Steps:
      1. Load a pretrained resnet34, cast it to float16 and export it.
      2. Re-import the exported symbol + params into a SymbolBlock.
      3. Check that a conv layer's weight parameter kept the float16 dtype.
    """
    # 1. Load a resnet model, cast it to fp16 and export.
    tmp = tempfile.mkdtemp()
    tmpfile = os.path.join(tmp, 'resnet34_fp16')
    ctx = mx.gpu(0)
    net_fp32 = mx.gluon.model_zoo.vision.resnet34_v2(
        pretrained=True, ctx=ctx, root=tmp)
    net_fp32.cast('float16')
    net_fp32.hybridize()
    data = mx.nd.zeros((1, 3, 224, 224), dtype='float16', ctx=ctx)
    net_fp32.forward(data)
    net_fp32.export(tmpfile, 0)

    # 2. Load the saved model and verify if all the params are loaded correctly.
    sm = mx.sym.load(tmpfile + '-symbol.json')
    inputs = mx.sym.var('data', dtype='float16')
    net_fp16 = mx.gluon.SymbolBlock(sm, inputs)
    net_fp16.collect_params().load(tmpfile + '-0000.params', ctx=ctx)

    # 3. A conv layer's weight param is expected to be of the casted dtype,
    # fp16.  Search explicitly and fail loudly if none exists, instead of the
    # old for/break loop which silently fell through to the *last* parameter
    # name when no conv weight matched.
    conv_weight_names = [name for name in net_fp16.params.keys()
                         if 'conv' in name and 'weight' in name]
    assert conv_weight_names, 'no conv weight parameter found in loaded model'
    param_name = conv_weight_names[0]
    assert np.dtype(net_fp16.params[param_name].dtype) == np.dtype(np.float16)
@with_seed()
def test_large_models():
    """Run a small conv net forward on very large inputs sized off GPU memory.

    Each trial allocates an input of roughly 20% of total GPU memory and just
    checks that forward completes (the .asnumpy() forces execution).
    """
    ctx = default_context()
    # Create model
    net = gluon.nn.HybridSequential()
    largest_num_features = 256
    with net.name_scope():
        net.add(nn.Conv2D(largest_num_features, 3))

    net.hybridize()
    net.initialize(mx.init.Normal(sigma=0.01), ctx=ctx)

    # Compute the height (=width) of the square tensor of the given size in bytes
    def tensor_size(big_tensor_bytes):
        bytes_per_float = 4
        sz = int(math.sqrt(big_tensor_bytes /
                           largest_num_features / bytes_per_float))
        # Round down to a multiple of 100 for tidier trial sizes.
        return (sz // 100) * 100

    # The idea is to create models with large tensors of (say) 20% of the total memory.
    # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's
    # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%).
    (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(ctx.device_id)
    start_size = tensor_size(0.20 * total_mem_bytes)
    num_trials = 10
    sys.stderr.write(
        ' testing global memory of size {} ... '.format(total_mem_bytes))
    sys.stderr.flush()
    for i in range(num_trials):
        # Shrink the input slightly on each trial.
        sz = start_size - 10 * i
        (height, width) = (sz, sz)
        sys.stderr.write(" {}x{} ".format(height, width))
        sys.stderr.flush()
        data_in = nd.random_uniform(low=0, high=255, shape=(1, 3, height, width),
                                    ctx=ctx, dtype="float32")
        # Evaluate model
        net(data_in).asnumpy()
# isolated execution bulking test function to be invoked with different env var settings
def _test_bulking_in_process(seed, time_per_iteration):
    """Time fwd/bwd over a chain of 1000 Flip ops; result goes to the shared
    `time_per_iteration` value (seconds per iteration).  `seed` is supplied by
    run_in_spawned_process and unused here."""
    # Use flip since it's a simple function with same-sized I/O unlikely to ever be fused.
    class Flip(gluon.HybridBlock):
        def __init__(self, **kwargs):
            super(Flip, self).__init__(**kwargs)

        def hybrid_forward(self, F, x):
            return F.flip(x, axis=0)

    def get_net(num_ops):
        # Sequential chain of `num_ops` Flip blocks.
        net = nn.HybridSequential()
        with net.name_scope():
            for _ in range(num_ops):
                net.add(Flip())
        return net

    data_shape = (10,)
    num_ops = 1000
    num_iterations = 20

    # build model
    x = mx.ndarray.zeros(data_shape)
    x.attach_grad()
    dy = mx.ndarray.ones(data_shape)
    net = get_net(num_ops)
    net.hybridize(static_alloc=True, static_shape=True)

    # time a number of forward() and backward() executions after some warm-up iterations
    warmups = 1
    for i in range(num_iterations + warmups):
        with autograd.record():
            # Start the clock only once warm-up is done.
            if i == warmups:
                start = time.time()
            y = net(x)
            y.backward(dy)
            # wait_to_read() blocks until this iteration's work is finished.
            x.grad.wait_to_read()
    time_per_iteration.value = (time.time() - start) / num_iterations
@with_seed()
def test_bulking():
    """Check that operator bulking of the fwd/bwd graph speeds up execution.

    Each case runs `_test_bulking_in_process` in a spawned process under
    different bulking env-var settings and compares the measured times:
    non-bulked runs should be slowest, half-bulked faster, fully-bulked
    fastest.
    """
    # test case format: (max_fwd_segment_size, max_bwd_segment_size, enable_bulking_in_training)
    test_cases = [(0, 0, True), (1, 1, True), (15, 15, False),
                  (15, 0, True), (0, 15, True), (15, 15, True)]
    times = {}
    # One Manager for all cases: the old per-case mp.Manager() spawned a fresh
    # manager server process on every loop iteration.
    manager = mp.Manager()
    for seg_sizes in test_cases:
        # Create shared variable to return measured time from test process
        time_per_iteration = manager.Value('d', 0.0)
        if not run_in_spawned_process(_test_bulking_in_process,
                                      {'MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD': seg_sizes[0],
                                       'MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD': seg_sizes[1],
                                       'MXNET_EXEC_BULK_EXEC_TRAIN': seg_sizes[2]},
                                      time_per_iteration):
            # skip test since the python version can't run it properly. Warning msg was logged.
            return
        times[seg_sizes] = time_per_iteration.value

    # Build the report once with join instead of repeated '+=' concatenation.
    times_str = ''.join(
        '\n runtime of (fwd,bwd,enable) op seg setting ({},{},{}) =\t{:.1f} msec'.format(
            seg_sizes[0], seg_sizes[1], seg_sizes[2], 1000.0 * times[seg_sizes])
        for seg_sizes in test_cases)

    fastest_non_bulked_time = min(
        times[(0, 0, True)], times[(1, 1, True)], times[(15, 15, False)])
    slowest_half_bulked_time = max(times[(0, 15, True)], times[(15, 0, True)])
    fastest_half_bulked_time = min(times[(0, 15, True)], times[(15, 0, True)])
    fully_bulked_time = times[(15, 15, True)]

    print(times_str)
    # Non-bulked times[0,0,True], times[1,1,True] and times[15,15,False] should be about the same,
    # slower than both half-bulked times[0,15,True] and times[15,0,True]
    assert slowest_half_bulked_time < fastest_non_bulked_time, \
        'A half-bulked exec time is slower than the non-bulked time by {} secs! {}' \
        .format(slowest_half_bulked_time - fastest_non_bulked_time, times_str)
    # The fully bulked times[15,15,True] should be faster than both half-bulked runs
    assert fully_bulked_time < fastest_half_bulked_time, \
        'The fully-bulked exec time is slower than a half-bulked time by {} secs! {}' \
        .format(fully_bulked_time - fastest_half_bulked_time, times_str)
if __name__ == '__main__':
    # Allow running this file directly; nose discovers the test_* functions.
    import nose
    nose.runmodule()