# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import mxnet as mx
import unittest
import os
import numpy as np
from mxnet import gluon
from mxnet.gluon import nn
from mxnet.test_utils import assert_almost_equal
from common import setup_module, with_seed, assertRaises
from copy import deepcopy
from nose.tools import raises, assert_raises


@with_seed()
@raises(RuntimeError)
def test_multi_trainer():
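    """A row_sparse Parameter may be associated with only one Trainer:
    creating a second Trainer for it should raise RuntimeError."""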
    x = gluon.Parameter('x', shape=(10,), stype='row_sparse')
    x.initialize()
    # test set trainer
    trainer0 = gluon.Trainer([x], 'sgd')
    assert x._trainer is trainer0
    # test unset trainer
    x._set_trainer(None)
    assert x._trainer is None
    x._set_trainer(trainer0)
    # multiple trainers for a sparse Parameter are not allowed
    trainer1 = gluon.Trainer([x], 'sgd')


@with_seed()
def test_trainer():
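    """End-to-end checks for Trainer: momentum SGD updates across two CPU
    contexts, save/load of trainer states, and explicit allreduce_grads()
    plus update() when update_on_kvstore=False."""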
    def dict_equ(a, b):
        assert set(a) == set(b)
        for k in a:
            assert (a[k].asnumpy() == b[k].asnumpy()).all()

    x = gluon.Parameter('x', shape=(10,))
    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5})
    with mx.autograd.record():
        for w in x.list_data():
            y = w + 1
            y.backward()
    trainer.step(1)
    # the optimizer's param_dict should map parameter indices to the trainer's parameters
    assert trainer._optimizer.param_dict == {0: x}
    assert (x.data(mx.cpu(1)).asnumpy() == -2).all()
    x.lr_mult = 0.5
    with mx.autograd.record():
        for w in x.list_data():
            y = w + 1
            y.backward()
    trainer.step(1)
    assert (x.data(mx.cpu(1)).asnumpy() == -4).all()

    trainer.save_states('test_trainer.states')
    states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \
             else deepcopy(trainer._updaters[0].states)
    trainer.load_states('test_trainer.states')
    if trainer._update_on_kvstore:
        dict_equ(trainer._kvstore._updater.states, states)
        assert trainer._optimizer == trainer._kvstore._updater.optimizer
        # invalid usage of update and allreduce_grads if update_on_kvstore
        assert_raises(AssertionError, trainer.update, 1)
        assert_raises(AssertionError, trainer.allreduce_grads)
    else:
        for updater in trainer._updaters:
            dict_equ(updater.states, states)
        assert trainer._optimizer == trainer._updaters[0].optimizer

    x = gluon.Parameter('x', shape=(10,))
    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    trainer2 = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5},
                             update_on_kvstore=False)
    with mx.autograd.record():
        for i, w in enumerate(x.list_data()):
            y = i * w
            y.backward()
    assert (x.grad(mx.cpu(0)).asnumpy() != x.grad(mx.cpu(1)).asnumpy()).all()
    trainer2.allreduce_grads()
    assert (x.grad(mx.cpu(0)).asnumpy() == x.grad(mx.cpu(1)).asnumpy()).all()
    trainer2.update(1)
    assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy()


@with_seed()
def test_trainer_save_load():
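    """Trainer states should survive a save/load round trip, and a later
    lr_mult change should still reach the optimizer. Note that os.putenv
    updates the process environment seen by the MXNet backend without
    touching os.environ, hence the getenv/putenv pairing below."""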
    previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1")
    os.putenv('MXNET_UPDATE_ON_KVSTORE', '1')
    x = gluon.Parameter('x', shape=(10,), lr_mult=1.0)
    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1})
    with mx.autograd.record():
        for w in x.list_data():
            y = w + 1
            y.backward()
    trainer.step(1)
    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1
    trainer.save_states('test_trainer_save_load.states')
    trainer.load_states('test_trainer_save_load.states')
    x.lr_mult = 2.0
    # the parameter dict should still be associated with the optimizer
    # after load_states, so the new lr_mult takes effect
    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
    os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore)


@with_seed()
def test_trainer_sparse_save_load():
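    """Same save/load round trip as test_trainer_save_load, but with a
    row_sparse parameter."""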
    x = gluon.Parameter('x', shape=(10, 1), lr_mult=1.0, stype='row_sparse')
    x.initialize(ctx=[mx.cpu(0)], init='zeros')
    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1})
    all_rows = mx.nd.arange(0, 10, ctx=mx.cpu(0))
    with mx.autograd.record():
        for w in x.list_row_sparse_data(all_rows):
            y = w * 1
            y.backward()
    trainer.step(1)
    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1
    trainer.save_states('test_trainer_sparse_save_load.states')
    trainer.load_states('test_trainer_sparse_save_load.states')
    x.lr_mult = 2.0
    # the parameter dict should still be associated with the optimizer
    # after load_states, so the new lr_mult takes effect
    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2


@with_seed()
def test_trainer_multi_layer_init():
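    """A block mixing a row_sparse embedding weight with a dense layer should
    have all parameters initialized and updated correctly on one or several
    contexts."""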
    class Net(gluon.Block):
        def __init__(self, **kwargs):
            super(Net, self).__init__(**kwargs)
            with self.name_scope():
                # sparse param
                self.embed_weight = self.params.get('embed_weight', stype='row_sparse',
                                                    shape=(4, 3), grad_stype='row_sparse')
                # dense param from a hybrid block
                self.dense0 = nn.Dense(2)

        def forward(self, x):
            embed_weight = self.embed_weight.row_sparse_data(x)
            embed = mx.nd.Embedding(data=x, weight=embed_weight,
                                    input_dim=4, output_dim=3, sparse_grad=True)
            return self.dense0(embed)

    def check_init(ctxes):
        net = Net(prefix='net_')
        net.initialize(mx.init.One(), ctx=ctxes)
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1})
        data = mx.nd.array([[0, 2], [1, 2]])
        xs = gluon.utils.split_and_load(data, ctxes)
        ys = []
        with mx.autograd.record():
            for x in xs:
                y = net(x)
                ys.append(y)
        for y in ys:
            y.backward()
        trainer.step(1)
        # all parameters should be initialized
        assert not trainer._params_to_init
        all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1))
        # check the updated weights
        weight = net.embed_weight.row_sparse_data(all_rows).asnumpy()
        assert (weight[0] == -1).all()
        assert (weight[1] == -1).all()
        assert (weight[2] == -3).all()
        assert (weight[3] == 1).all()

    check_init([mx.cpu(1), mx.cpu(2)])
    check_init([mx.cpu(1)])


@with_seed()
def test_trainer_reset_kv():
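    """Loading new parameter values resets the Trainer's kvstore; the next
    step() must re-initialize it and update the freshly loaded values."""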
    def check_trainer_reset_kv(kv):
        params = gluon.ParameterDict()
        x = params.get('x', shape=(10,), lr_mult=1.0)
        params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
        trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
        params.save('test_trainer_reset_kv.params')
        with mx.autograd.record():
            for w in x.list_data():
                y = w + 1
                y.backward()
        trainer.step(1)
        assert trainer._kvstore.type == kv
        # load would reset kvstore
        mx.nd.waitall()
        params.load('test_trainer_reset_kv.params')
        if trainer._update_on_kvstore:
            # drop kvstore state if new parameters are loaded
            assert trainer._kvstore is None
            assert trainer._kv_initialized is False
        with mx.autograd.record():
            for w in x.list_data():
                y = w + 1
                y.backward()
        trainer.step(1)
        # the updated parameter should be based on the loaded checkpoint
        assert (x.data(mx.cpu()) == -0.2).asnumpy().all()

    kvs = ['local', 'device']
    for kv in kvs:
        check_trainer_reset_kv(kv)


@with_seed()
def test_trainer_sparse_kv():
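    """Resolution of update_on_kvstore for each combination of parameter and
    gradient storage types: row_sparse weights must be updated on the
    kvstore, so forcing update_on_kvstore=False for them is an error."""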
    def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected):
        params = gluon.ParameterDict()
        x = params.get('x', shape=(10, 1), lr_mult=1.0, stype=stype, grad_stype=grad_stype)
        params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
        trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1},
                                kvstore=kv, update_on_kvstore=update_on_kv)
        all_rows = mx.nd.arange(0, 10, ctx=mx.cpu(0))
        try:
            ws = x.list_data() if stype == 'default' else x.list_row_sparse_data(all_rows)
            with mx.autograd.record():
                for w in ws:
                    y = w + 1
                    y.backward()
            trainer.step(1)
            assert trainer._kvstore.type == kv
            assert trainer._kv_initialized
            assert trainer._update_on_kvstore is expected
            # the updated parameter should reflect the aggregated gradient
            mx.nd.waitall()
            updated_w = x.data(mx.cpu(0)) if stype == 'default' else x.row_sparse_data(all_rows)
            assert (updated_w == -0.2).asnumpy().all()
        except Exception as err:
            assert isinstance(err, expected)

    kvs = ['local', 'device']
    global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1")))
    for kv in kvs:
        check_trainer_sparse_kv(kv, 'default', 'default', True, True)
        check_trainer_sparse_kv(kv, 'default', 'default', False, False)
        check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore)
        check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False)
        check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True)
        check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False)
        check_trainer_sparse_kv(kv, 'row_sparse', 'row_sparse', None, True)
        check_trainer_sparse_kv(kv, 'row_sparse', 'row_sparse', False, ValueError)


@with_seed()
def test_trainer_lr_sched():
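    """FactorScheduler should decay the Trainer's learning rate by `factor`
    every `freq` steps, both with and without update_on_kvstore."""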
    x = gluon.Parameter('x', shape=(10,))
    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    freq = 2
    factor = 0.1
    lr = 1
    lr_sched = mx.lr_scheduler.FactorScheduler(freq, factor=factor, base_lr=lr)
    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': lr, 'lr_scheduler': lr_sched})
    for i in range(10):
        with mx.autograd.record():
            for w in x.list_data():
                y = w + 1
                y.backward()
        trainer.step(1)
        if i % freq == 0:
            assert trainer.learning_rate == lr, (lr, trainer.learning_rate, i)
            lr *= factor
    mx.nd.waitall()

    # update_on_kvstore=False
    x = gluon.Parameter('x', shape=(10,))
    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    freq = 2
    factor = 0.1
    lr = 1
    lr_sched = mx.lr_scheduler.FactorScheduler(freq, factor=factor, base_lr=lr)
    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': lr, 'lr_scheduler': lr_sched},
                            update_on_kvstore=False)
    for i in range(10):
        with mx.autograd.record():
            for w in x.list_data():
                y = w + 1
                y.backward()
        trainer.step(1)
        if i % freq == 0:
            assert trainer.learning_rate == lr, (lr, trainer.learning_rate, i)
            lr *= factor
    mx.nd.waitall()
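

# run with nose when this file is executed directly, matching the usual
# footer in MXNet's unit-test modules
if __name__ == '__main__':
    import nose
    nose.runmodule()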