| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import sys |
| import os |
| import time |
| import random |
| import mxnet as mx |
| import multiprocessing as mp |
| from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, rand_ndarray, environment |
| import numpy as _np |
| import math |
| from mxnet import autograd |
| import pytest |
| |
| curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) |
| sys.path.insert(0, os.path.join(curr_path, '../unittest')) |
| from common import assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed |
| from test_gluon import * |
| from test_loss import * |
| from test_numpy_loss import * |
| from test_gluon_rnn import * |
| |
| set_default_device(mx.gpu(0)) |
| |
| |
| def check_rnn_layer(layer): |
| layer.initialize(device=[mx.cpu(0), mx.gpu(0)]) |
| with mx.gpu(0): |
| x = mx.np.ones((10, 16, 30)) |
| states = layer.begin_state(16) |
| go, gs = layer(x, states) |
| |
| with mx.cpu(0): |
| x = mx.np.ones((10, 16, 30)) |
| states = layer.begin_state(16) |
| co, cs = layer(x, states) |
| |
| assert_almost_equal(go, co) |
| for g, c in zip(gs, cs): |
| assert_almost_equal(g, c) |
| |
| |
| def check_rnn_layer_w_rand_inputs(layer): |
| layer.initialize(device=[mx.cpu(0), mx.gpu(0)]) |
| x = mx.np.random.uniform(size=(10, 16, 30)) |
| with mx.gpu(0): |
| x = x.copyto(mx.gpu(0)) |
| states = layer.begin_state(16) |
| go, gs = layer(x, states) |
| |
| with mx.cpu(0): |
| x = x.copyto(mx.cpu(0)) |
| states = layer.begin_state(16) |
| co, cs = layer(x, states) |
| |
| assert_almost_equal(go, co) |
| for g, c in zip(gs, cs): |
| assert_almost_equal(g, c) |
| |
| |
| @mx.util.use_np |
| @assert_raises_cudnn_not_satisfied(min_version='7.2.1') |
| def test_lstmp(): |
| hidden_size, projection_size = 3, 2 |
| rtol, atol = 1e-2, 1e-2 |
| batch_size, seq_len = 7, 11 |
| input_size = 5 |
| device = mx.gpu(0) |
| lstm_input = mx.np.random.uniform( |
| size=(seq_len, batch_size, input_size), device=device) |
| shapes = {'i2h_weight': (hidden_size * 4, input_size), |
| 'h2h_weight': (hidden_size * 4, projection_size), |
| 'i2h_bias': (hidden_size * 4,), |
| 'h2h_bias': (hidden_size * 4,), |
| 'h2r_weight': (projection_size, hidden_size)} |
| weights = {k: rand_ndarray(v).as_np_ndarray() for k, v in shapes.items()} |
| lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, |
| input_size=input_size) |
| lstm_cell = gluon.rnn.LSTMPCell(hidden_size=hidden_size, |
| projection_size=projection_size, |
| input_size=input_size) |
| lstm_layer.initialize(device=device) |
| lstm_cell.initialize(device=device) |
| layer_params = lstm_layer.collect_params() |
| cell_params = lstm_cell.collect_params() |
| params = (weights['{}_{}'.format(g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for g in ['i2h', 'h2h', 'h2r'] |
| if g != 'h2r' or t != 'bias') |
| |
| net_params_concat = mx.np.concatenate(params) |
| layer_params['rnn_param'].set_data(net_params_concat) |
| for k, v in weights.items(): |
| cell_params[k].set_data(v) |
| with autograd.record(): |
| layer_output = lstm_layer(lstm_input.copy()) |
| cell_output = lstm_cell.unroll(seq_len, lstm_input.copy(), layout='TNC', |
| merge_outputs=True)[0] |
| |
| assert_almost_equal(layer_output, cell_output, rtol=rtol, atol=atol) |
| layer_output.backward() |
| cell_output.backward() |
| layer_params_split = split_rnn_params(layer_params['rnn_param'].grad(),\ |
| 'lstm', 1, input_size, hidden_size, False, projection_size=projection_size) |
| for k, _ in weights.items(): |
| layer_grad = layer_params_split['l0_' + k] |
| cell_grad = cell_params[k].grad() |
| print('checking gradient for {}'.format('lstm0_l0_' + k)) |
| assert_almost_equal(layer_grad, cell_grad, rtol=rtol, atol=atol) |
| check_rnn_layer_forward(gluon.rnn.LSTM( |
| 10, 2, projection_size=5), mx.np.ones((8, 3, 20)), device=device) |
| check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.np.ones( |
| (8, 3, 20)), [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], device=device) |
| check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.np.ones((8, 3, 20)), |
| run_only=True, device=device) |
| check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, projection_size=5), |
| mx.np.ones((8, 3, 20)), |
| [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], run_only=True, device=device) |
| lstm_layer.save_parameters('gpu_tmp.params') |
| lstm_layer.load_parameters('gpu_tmp.params') |
| |
| |
| @assert_raises_cudnn_not_satisfied(min_version='7.2.1') |
| @pytest.mark.flaky |
| def test_lstm_clip(): |
| hidden_size, projection_size = 4096, 2048 |
| batch_size, seq_len = 32, 80 |
| input_size = 50 |
| clip_min, clip_max, clip_nan = -5, 5, True |
| lstm_input = mx.np.random.uniform( |
| size=(seq_len, batch_size, input_size), device=mx.gpu(0)) |
| lstm_states = [mx.np.random.uniform(size=(2, batch_size, projection_size), device=mx.gpu(0)), |
| mx.np.random.uniform(size=(2, batch_size, hidden_size), device=mx.gpu(0))] |
| lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, |
| input_size=input_size, |
| bidirectional=True, |
| state_clip_min=clip_min, |
| state_clip_max=clip_max, |
| state_clip_nan=clip_nan) |
| lstm_layer.initialize(device=mx.gpu(0)) |
| with autograd.record(): |
| _, layer_output_states = lstm_layer(lstm_input, lstm_states) |
| cell_states = layer_output_states[0] |
| assert (cell_states >= clip_min).all() and (cell_states <= clip_max).all() |
| assert not _np.isnan(cell_states).any() |
| |
| |
| @assert_raises_cudnn_not_satisfied(min_version='5.1.10') |
| def test_rnn_layer(): |
| check_rnn_layer(gluon.rnn.RNN(100, num_layers=3)) |
| check_rnn_layer(gluon.rnn.RNN(100, activation='tanh', num_layers=3)) |
| check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3)) |
| check_rnn_layer(gluon.rnn.GRU(100, num_layers=3)) |
| |
| check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True)) |
| check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM( |
| 100, num_layers=3, bidirectional=True)) |
| |
| |
| @mx.util.use_np |
| def check_layer_bidirectional(size, in_size, proj_size): |
| class RefBiLSTM(gluon.Block): |
| def __init__(self, size, proj_size, **kwargs): |
| super(RefBiLSTM, self).__init__(**kwargs) |
| self._lstm_fwd = gluon.rnn.LSTM( |
| size, projection_size=proj_size, bidirectional=False) |
| self._lstm_bwd = gluon.rnn.LSTM( |
| size, projection_size=proj_size, bidirectional=False) |
| |
| def forward(self, inpt): |
| fwd = self._lstm_fwd(inpt) |
| bwd_inpt = mx.np.flip(inpt, 0) |
| bwd = self._lstm_bwd(bwd_inpt) |
| bwd = mx.np.flip(bwd, 0) |
| return mx.np.concatenate([fwd, bwd], axis=2) |
| weights = {} |
| for d in ['l', 'r']: |
| weights['{}0_i2h_weight'.format(d)] = mx.np.random.uniform( |
| size=(size * 4, in_size)) |
| if proj_size: |
| weights['{}0_h2h_weight'.format(d)] = mx.np.random.uniform( |
| size=(size * 4, proj_size)) |
| weights['{}0_h2r_weight'.format(d)] = mx.np.random.uniform( |
| size=(proj_size, size)) |
| else: |
| weights['{}0_h2h_weight'.format( |
| d)] = mx.np.random.uniform(size=(size * 4, size)) |
| weights['{}0_i2h_bias'.format( |
| d)] = mx.np.random.uniform(size=(size * 4,)) |
| weights['{}0_h2h_bias'.format( |
| d)] = mx.np.random.uniform(size=(size * 4,)) |
| |
| if proj_size: |
| params = (weights['{}0_{}_{}'.format(d, g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for d in ['l', 'r'] |
| for g in ['i2h', 'h2h', 'h2r'] |
| if g != 'h2r' or t != 'bias') |
| else: |
| params = (weights['{}0_{}_{}'.format(d, g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for d in ['l', 'r'] |
| for g in ['i2h', 'h2h']) |
| |
| net_params_concat = mx.np.concatenate(params) |
| if proj_size: |
| params_left = (weights['l0_{}_{}'.format(g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for g in ['i2h', 'h2h', 'h2r'] |
| if g != 'h2r' or t != 'bias') |
| else: |
| params_left = (weights['l0_{}_{}'.format(g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for g in ['i2h', 'h2h']) |
| if proj_size: |
| params_right = (weights['r0_{}_{}'.format(g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for g in ['i2h', 'h2h', 'h2r'] |
| if g != 'h2r' or t != 'bias') |
| else: |
| params_right = (weights['r0_{}_{}'.format(g, t)].reshape(-1) |
| for t in ['weight', 'bias'] |
| for g in ['i2h', 'h2h']) |
| net_ref_left_params = mx.np.concatenate(params_left) |
| net_ref_right_params = mx.np.concatenate(params_right) |
| net = gluon.rnn.LSTM(size, projection_size=proj_size, |
| bidirectional=True) |
| ref_net = RefBiLSTM(size, proj_size) |
| net.initialize() |
| ref_net.initialize() |
| net_params = net.collect_params() |
| ref_net_params = ref_net.collect_params() |
| net_params['rnn_param'].set_data(net_params_concat) |
| ref_net_params['_lstm_fwd.rnn_param'].set_data(net_ref_left_params) |
| ref_net_params['_lstm_bwd.rnn_param'].set_data(net_ref_right_params) |
| |
| data = mx.np.random.uniform(size=(11, 10, in_size)) |
| mx.test_utils.assert_allclose(net(data), ref_net(data), rtol=1e-6) |
| |
| |
| |
| def check_layer_bidirectional_varseqlen(size, in_size): |
| weight = mx.np.random.uniform(size=(784,)) |
| |
| net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True) |
| ref_net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=False) |
| net.initialize() |
| ref_net.initialize() |
| net_params = net.collect_params() |
| ref_net_params = ref_net.collect_params() |
| net_params['rnn_param'].set_data(weight) |
| ref_net_params['rnn_param'].set_data(weight) |
| |
| batch_size = 10 |
| num_timesteps = 11 |
| data = mx.np.random.uniform(size=(num_timesteps, batch_size, in_size)) |
| data_np = data.asnumpy() |
| |
| sequence_length = mx.np.random.randint(1, num_timesteps+1, size=(batch_size)).astype("int32") |
| sequence_length_np = sequence_length.asnumpy().astype("int32") |
| |
| # Reference net is processing batch elements one at a time, so that it is "perfectly sized" |
| # Because of that, we need to accumulate gradients in reference net. |
| for p in ref_net.collect_params().values(): |
| p.grad_req = 'add' |
| |
| ref_net_output = [] |
| with autograd.record(): |
| net_output = net(data.copy(), sequence_length=sequence_length.copy()) |
| |
| for b in range(batch_size): |
| data_slice = mx.np.array(data_np[:sequence_length_np[b], b, :]).reshape(sequence_length_np[b], 1, in_size) |
| ref_output_slice = ref_net(data_slice) |
| ref_net_output.append(ref_output_slice) |
| |
| net_output_np = net_output.asnumpy() |
| |
| # TODO: test state return value as well output |
| # Only compare the valid sections for each batch entry |
| for b in range(batch_size): |
| assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1), |
| rtol=1e-2, atol=1e-6) |
| |
| # Now test backward |
| net_output.backward() |
| |
| for ref_output_slice in ref_net_output: |
| ref_output_slice.backward() |
| |
| ref_net_params = ref_net.collect_params() |
| |
| net_grad = net_params['rnn_param'].grad() |
| ref_net_grad = ref_net_params['rnn_param'].grad() |
| assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(), |
| rtol=1e-2, atol=1e-6) |
| |
| |
| @assert_raises_cudnn_not_satisfied(min_version='5.1.10') |
| def test_layer_bidirectional(): |
| check_layer_bidirectional(7, 5, 0) |
| |
| |
| @assert_raises_cudnn_not_satisfied(min_version='7.2.1') |
| def test_layer_bidirectional_proj(): |
| check_layer_bidirectional(7, 5, 3) |
| |
| @assert_raises_cudnn_not_satisfied(min_version='7.2.1') |
| def test_layer_bidirectional_varseqlength(): |
| check_layer_bidirectional_varseqlen(7, 5) |
| |
| |
| @assert_raises_cudnn_not_satisfied(min_version='5.1.10') |
| def test_rnn_layer_begin_state_type(): |
| fake_data = mx.np.random.uniform(size=(3, 5, 7), dtype='float16') |
| modeling_layer = gluon.rnn.LSTM( |
| hidden_size=11, num_layers=2, dropout=0.2, bidirectional=True) |
| modeling_layer.cast('float16') |
| modeling_layer.initialize() |
| modeling_layer(fake_data) |
| |
| |
| def test_gluon_ctc_consistency(): |
| loss = mx.gluon.loss.CTCLoss() |
| data = mx.np.flip(mx.np.repeat(mx.np.arange(0, 4, device=mx.gpu(0)), 40).reshape((2, 20, 4)), axis=0) |
| cpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], device=mx.cpu(0)) |
| gpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], device=mx.gpu(0)) |
| |
| cpu_data = data.copy().to_device(mx.cpu(0)) |
| cpu_data.attach_grad() |
| with mx.autograd.record(): |
| l_cpu = loss(cpu_data, cpu_label) |
| l_cpu.backward() |
| |
| gpu_data = data.copyto(mx.gpu(0)) |
| gpu_data.attach_grad() |
| with mx.autograd.record(): |
| l_gpu = loss(gpu_data, gpu_label) |
| l_gpu.backward() |
| |
| assert_almost_equal(cpu_data.grad, gpu_data.grad, atol=1e-3, rtol=1e-3) |
| |
| |
| def test_global_norm_clip_multi_device(): |
| for check_isfinite in [True, False]: |
| x1 = mx.np.ones((3, 3), device=mx.gpu(0)) |
| x2 = mx.np.ones((4, 4), device=mx.cpu(0)) |
| x3 = mx.np.ones((7, 4), device=mx.gpu(0)) |
| x4 = mx.np.ones((7, 4), device=mx.cpu(0)) |
| norm = gluon.utils.clip_global_norm( |
| [x1, x2, x3, x4], 1.0, check_isfinite=check_isfinite) |
| if check_isfinite: |
| assert norm == 9.0 |
| else: |
| assert norm.item() == 9.0 |
| assert_almost_equal(x1, _np.ones((3, 3)) / 9) |
| assert_almost_equal(x2, _np.ones((4, 4)) / 9) |
| assert_almost_equal(x3, _np.ones((7, 4)) / 9) |
| assert_almost_equal(x4, _np.ones((7, 4)) / 9) |
| |
| |
| def _check_batchnorm_result(input, num_devices=1, cuda=False): |
| from mxnet.gluon.utils import split_and_load |
| def _find_bn(module): |
| if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.nn.SyncBatchNorm)): |
| return module |
| elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.nn.SyncBatchNorm)): |
| return module.module |
| |
| raise RuntimeError('BN not found') |
| |
| def _syncParameters(bn1, bn2, device): |
| device = input.context |
| bn2.gamma.set_data(bn1.gamma.data(device)) |
| bn2.beta.set_data(bn1.beta.data(device)) |
| bn2.running_mean.set_data(bn1.running_mean.data(device)) |
| bn2.running_var.set_data(bn1.running_var.data(device)) |
| |
| input1 = input.copy() |
| input2 = input.copy() |
| |
| if cuda: |
| input1 = input.to_device(mx.gpu(0)) |
| device_list = [mx.gpu(i) for i in range(num_devices)] |
| else: |
| device_list = [mx.cpu(0) for _ in range(num_devices)] |
| |
| nch = input.shape[1] |
| bn1 = mx.gluon.nn.BatchNorm(in_channels=nch) |
| bn2 = mx.gluon.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices) |
| |
| bn1.initialize(device=device_list[0]) |
| bn2.initialize(device=device_list) |
| |
| # using the same values for gamma and beta |
| #_syncParameters(_find_bn(bn1), _find_bn(bn2), device_list[0]) |
| |
| input1.attach_grad() |
| inputs2 = split_and_load(input2, device_list, batch_axis=0) |
| for xi in inputs2: |
| xi.attach_grad() |
| |
| with mx.autograd.record(): |
| output1 = bn1(input1) |
| output2 = [bn2(xi) for xi in inputs2] |
| loss1 = (output1 ** 2).sum() |
| loss2 = [(output ** 2).sum() for output in output2] |
| mx.autograd.backward(loss1) |
| mx.autograd.backward(loss2) |
| |
| output2 = mx.np.concatenate([output.to_device(input.context) for output in output2], axis=0) |
| # assert forwarding |
| assert_almost_equal(input1, input2, atol=1e-3, rtol=1e-3) |
| assert_almost_equal(output1, output2, atol=1e-3, rtol=1e-3) |
| assert_almost_equal(_find_bn(bn1).running_mean.data(device_list[0]), |
| _find_bn(bn2).running_mean.data(device_list[0]), |
| atol=1e-3, rtol=1e-3) |
| assert_almost_equal(_find_bn(bn1).running_var.data(device_list[0]), |
| _find_bn(bn2).running_var.data(device_list[0]), |
| atol=1e-3, rtol=1e-3) |
| input2grad = mx.np.concatenate([output.grad.to_device(input.context) for output in inputs2], axis=0) |
| assert_almost_equal(input1.grad, input2grad, atol=1e-3, rtol=1e-3) |
| |
| @mx.util.use_np |
| def test_sync_batchnorm(): |
| def get_num_devices(): |
| for i in range(100): |
| try: |
| mx.np.zeros((1,), device=mx.gpu(i)) |
| except: |
| return i |
| # no need to use SyncBN with 1 gpu |
| if get_num_devices() < 2: |
| return |
| ndev = 2 |
| # check with unsync version |
| for _ in range(10): |
| _check_batchnorm_result(mx.np.random.uniform(size=(4, 1, 4, 4)), |
| num_devices=ndev, cuda=True) |
| |
| def test_symbol_block_fp16(tmpdir): |
| # Test case to verify if initializing the SymbolBlock from a model with params |
| # other than fp32 param dtype. |
| |
| # 1. Load a resnet model, cast it to fp16 and export |
| tmp = str(tmpdir) |
| tmpfile = os.path.join(tmp, 'resnet34_fp16') |
| device = mx.gpu(0) |
| |
| net_fp32 = mx.gluon.model_zoo.vision.resnet34_v2( |
| pretrained=True, device=device, root=tmp) |
| net_fp32.cast('float16') |
| net_fp32.hybridize() |
| data = mx.np.zeros((1, 3, 224, 224), dtype='float16', device=device) |
| net_fp32(data) |
| symbol_file, param_file = net_fp32.export(tmpfile, 0) |
| |
| # 2. Load the saved model and verify if all the params are loaded correctly. |
| # Choose one of the parameters to verify the type is fp16. |
| sm = mx.sym.load(symbol_file) |
| inputs = mx.sym.var('data', dtype='float16') |
| net_fp16 = mx.gluon.SymbolBlock(sm, inputs) |
| net_fp16.load_parameters(param_file, device=device) |
| # 3. Get a conv layer's weight parameter name. Conv layer's weight param is |
| # expected to be of dtype casted, fp16. |
| name = None |
| for param_name in net_fp32.collect_params().keys(): |
| if 'conv' in param_name and 'weight' in param_name: |
| name = param_name |
| break |
| assert _np.dtype(net_fp16.params[name].dtype) == _np.dtype(_np.float16) |
| |
| |
| @pytest.mark.serial |
| def test_large_models(): |
| device = default_device() |
| # Create model |
| net = gluon.nn.HybridSequential() |
| |
| largest_num_features = 256 |
| net.add(nn.Conv2D(largest_num_features, 3)) |
| |
| net.hybridize() |
| net.initialize(mx.init.Normal(sigma=0.01), device=device) |
| |
| # Compute the height (=width) of the square tensor of the given size in bytes |
| def tensor_size(big_tensor_bytes): |
| bytes_per_float = 4 |
| sz = int(math.sqrt(big_tensor_bytes / |
| largest_num_features / bytes_per_float)) |
| return (sz // 100) * 100 |
| |
| # The idea is to create models with large tensors of (say) 20% of the total memory. |
| # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's |
| # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%). |
| (free_mem_bytes, total_mem_bytes) = mx.device.gpu_memory_info(device.device_id) |
| # This test needs to be 'qualified' for use with each new larger memory size |
| largest_supported_total_mem_GB = 32 |
| if (total_mem_bytes > largest_supported_total_mem_GB * 1024 * 1024 * 1024): |
| sys.stderr.write( |
| ' bypassing test due to too-large global memory of size {} ... '.format(total_mem_bytes)) |
| return |
| |
| start_size = tensor_size(0.20 * total_mem_bytes) |
| num_trials = 10 |
| sys.stderr.write( |
| ' testing global memory of size {} ... '.format(total_mem_bytes)) |
| sys.stderr.flush() |
| for i in range(num_trials): |
| sz = start_size - 10 * i |
| (height, width) = (sz, sz) |
| sys.stderr.write(" {}x{} ".format(height, width)) |
| sys.stderr.flush() |
| data_in = mx.np.random.uniform(low=0, high=255, size=(1, 3, height, width), |
| device=device, dtype="float32") |
| # Evaluate model |
| net(data_in).asnumpy() |
| |
| |
| @mx.util.use_np |
| def test_hybridblock_mix_device_raise(): |
| class FooHybrid(gluon.HybridBlock): |
| def forward(self, a, b): |
| if isinstance(a, (list, tuple)): |
| a = sum(a) |
| if isinstance(b, (list, tuple)): |
| b = sum(b) |
| return a + b |
| foo_hybrid = FooHybrid() |
| foo_hybrid.hybridize() |
| pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), device=mx.gpu()), |
| mx.np.ones((10,), device=mx.cpu()))) |
| |
| |
| @mx.util.use_np |
| def test_gemms_true_fp16(): |
| device = mx.gpu(0) |
| input = mx.np.random.uniform(size=(1, 512), dtype='float16', device=device) |
| weights = mx.np.random.uniform(size=(128, 512), device=device) |
| |
| net = nn.Dense(128, in_units=512, use_bias=False) |
| net.cast('float16') |
| net.initialize(device=device) |
| net.weight.set_data(weights) |
| |
| with environment('MXNET_FC_TRUE_FP16', '0'): |
| ref_results = net(input) |
| |
| with environment('MXNET_FC_TRUE_FP16', '1'): |
| results_trueFP16 = net(input) |
| |
| atol = 1e-2 |
| rtol = 1e-2 |
| assert_almost_equal(ref_results.asnumpy(), results_trueFP16.asnumpy(), |
| atol=atol, rtol=rtol) |
| |
| @mx.util.use_np |
| def test_cudnn_dropout_reproducibility(): |
| d = nn.Dropout(0.5) |
| d.initialize() |
| a = mx.np.random.uniform(size=(100,100)) |
| b = a.copy() |
| a.attach_grad() |
| b.attach_grad() |
| seed = mx.np.random.randint(0, 100000).item() |
| N = 10 |
| mx.np.random.seed(seed) |
| out1 = [] |
| for _ in range(N): |
| with autograd.record(): |
| out1.append(d(a)) |
| out1[0].backward() |
| mx.np.random.seed(seed) |
| out2 = [] |
| for _ in range(N): |
| with autograd.record(): |
| out2.append(d(b)) |
| out2[0].backward() |
| |
| for first, second in zip(out1, out2): |
| assert_almost_equal(first, second) |
| |
| assert_almost_equal(a.grad, b.grad) |
| |
| @mx.util.use_np |
| def test_cuda_graphs(): |
| class GraphTester(gluon.HybridBlock): |
| def __init__(self, function_to_test, **kwargs): |
| super(GraphTester, self).__init__(**kwargs) |
| self.f = function_to_test() |
| |
| def forward(self, *args): |
| # We need to isolate the operation to be fully inside the graph |
| # in order for graphs usage to be possible |
| copied_args = [mx.np.copy(a) for a in args] |
| outputs = self.f(*copied_args) |
| if isinstance(outputs, (list, tuple)): |
| return [mx.np.copy(o) for o in outputs] |
| else: |
| return mx.np.copy(outputs) |
| |
| class TestDesc: |
| def __init__(self, name, f, num_inputs=1, input_dim=4): |
| self.name = name |
| self.f = f |
| self.num_inputs = num_inputs |
| self.input_dim = input_dim |
| |
| def generate_inputs(self): |
| shape = tuple(_np.random.randint(4, 11, size=self.input_dim)) |
| ret = [mx.np.random.uniform(size=shape) for _ in range(self.num_inputs)] |
| for r in ret: |
| r.attach_grad() |
| return ret |
| |
| tested_ops = [ |
| TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), |
| TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), |
| TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), |
| TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), |
| TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), |
| TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), |
| TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), |
| TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), |
| TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), |
| TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), |
| TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), |
| TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), |
| TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), |
| TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), |
| TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), |
| TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), |
| TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), |
| TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), |
| TestDesc('ELU', lambda: mx.gluon.nn.ELU()), |
| TestDesc('SELU', lambda: mx.gluon.nn.SELU()), |
| TestDesc('Swish', lambda: mx.gluon.nn.Swish()), |
| ] |
| |
| N = 10 |
| |
| with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', |
| 'MXNET_USE_FUSION': '0'}): |
| device = mx.gpu(0) |
| for test_desc in tested_ops: |
| print("Testing ", test_desc.name) |
| inputs = test_desc.generate_inputs() |
| inputsg = [i.copy() for i in inputs] |
| for i in inputsg: |
| i.attach_grad() |
| seed = random.randint(0, 10000) |
| net = GraphTester(test_desc.f) |
| netg = GraphTester(test_desc.f) |
| |
| # initialize parameters |
| net.initialize(device=device) |
| netg.initialize(device=device) |
| |
| net(*inputs) |
| |
| for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): |
| p2.set_data(p1.data()) |
| |
| netg.hybridize(static_alloc=True, static_shape=True) |
| |
| print("Testing inference mode") |
| with random_seed(seed): |
| for _ in range(N): |
| assert_almost_equal(net(*inputs), netg(*inputsg)) |
| |
| mx.npx.waitall() |
| print("Testing training mode") |
| for _ in range(N): |
| with random_seed(seed): |
| with mx.autograd.record(): |
| out = net(*inputs) |
| out.backward() |
| |
| with random_seed(seed): |
| with mx.autograd.record(): |
| outg = netg(*inputsg) |
| outg.backward() |
| |
| assert_almost_equal(out, outg) |
| for i, ig in zip(inputs, inputsg): |
| assert_almost_equal(i.grad, ig.grad) |
| |
| for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): |
| assert_almost_equal(p1.data(), p2.data()) |
| if p1.grad_req != 'null': |
| assert_almost_equal(p1.grad(), p2.grad()) |
| mx.npx.waitall() |