| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ============================================================================= |
| |
| from __future__ import division |
| |
| import os |
| import unittest |
| import numpy as np |
| |
| from singa import singa_wrap as singa_api |
| from singa.tensor import Tensor |
| from singa import autograd |
| from singa import tensor |
| from singa import layer |
| from singa import model |
| from singa import opt |
| |
| from cuda_helper import gpu_dev, cpu_dev |
| |
| |
| class DoubleLinear(layer.Layer): |
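    """A layer composed of two Linear layers; its parameters are registered
    under hierarchical names such as 'doublelinear1.l1.W'."""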
| |
| def __init__(self, a, b, c): |
| super(DoubleLinear, self).__init__() |
| self.l1 = layer.Linear(a, b) |
| self.l2 = layer.Linear(b, c) |
| |
| def forward(self, x): |
| y = self.l1(x) |
| y = self.l2(y) |
| return y |
| |
| |
| class MyModel(model.Model): |
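    """A small Conv2d -> BatchNorm2d -> DoubleLinear model whose states are
    saved and restored by the tests below."""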
| |
| def __init__(self): |
| super(MyModel, self).__init__() |
| self.conv1 = layer.Conv2d(2, 2) |
| self.bn1 = layer.BatchNorm2d(2) |
| self.doublelinear1 = DoubleLinear(2, 4, 2) |
| self.optimizer = opt.SGD() |
| |
| def forward(self, x): |
| y = self.conv1(x) |
| y = self.bn1(y) |
| y = autograd.reshape(y, (y.shape[0], -1)) |
| y = self.doublelinear1(y) |
| return y |
| |
    def train_one_batch(self, x, y):
        y_ = self.forward(x)
        loss = self.loss(y_, y)
        self.optim(loss)
        return y_, loss
| |
| def loss(self, out, ty): |
| return autograd.softmax_cross_entropy(out, ty) |
| |
| def optim(self, loss): |
| self.optimizer(loss) |
| |
| |
| class MLP(model.Model): |
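    """A two-layer perceptron. The Linear layers are created with only their
    output sizes; the input sizes are inferred when the model is compiled."""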
| |
| def __init__(self, data_size=10, perceptron_size=100, num_classes=10): |
| super(MLP, self).__init__() |
| self.num_classes = num_classes |
| self.dimension = 2 |
| |
| self.relu = layer.ReLU() |
| self.linear1 = layer.Linear(perceptron_size) |
| self.linear2 = layer.Linear(num_classes) |
| self.softmax_cross_entropy = layer.SoftMaxCrossEntropy() |
| |
| def forward(self, inputs): |
| y = self.linear1(inputs) |
| y = self.relu(y) |
| y = self.linear2(y) |
| return y |
| |
| def train_one_batch(self, x, y): |
| out = self.forward(x) |
| loss = self.softmax_cross_entropy(out, y) |
| self.optimizer(loss) |
| return out, loss |
| |
| def set_optimizer(self, optimizer): |
| self.optimizer = optimizer |
| |
| |
# models used by the LSTM tests
| class LSTMModel3(model.Model): |
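    """A CudnnRNN with use_mask=True, so time steps beyond each sample's
    seq_length are masked out, followed by a Linear layer."""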
| |
| def __init__(self, hidden_size): |
| super(LSTMModel3, self).__init__() |
| self.lstm = layer.CudnnRNN( |
| hidden_size=hidden_size, |
| batch_first=True, |
| # return_sequences=True, |
| use_mask=True) |
| self.l1 = layer.Linear(2) |
| self.optimizer = opt.SGD(0.1) |
| |
| def forward(self, x, seq_lengths): |
| y = self.lstm(x, seq_lengths=seq_lengths) |
| y = autograd.reshape(y, (y.shape[0], -1)) |
| y = self.l1(y) |
| return y |
| |
| |
| class LSTMModel2(model.Model): |
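    """An LSTM that returns only the output of the last time step
    (return_sequences=False)."""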
| |
| def __init__(self, hidden_size, bidirectional, num_layers): |
| super(LSTMModel2, self).__init__() |
| self.lstm = layer.CudnnRNN(hidden_size=hidden_size, |
| num_layers=num_layers, |
| bidirectional=bidirectional, |
| return_sequences=False, |
| rnn_mode='lstm', |
| batch_first=True) |
| self.optimizer = opt.SGD(0.1) |
| |
| def forward(self, x): |
| return self.lstm(x) |
| |
| |
| class LSTMModel(model.Model): |
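    """An LSTM whose direction, depth, mode and output layout are all
    configurable; when return_sequences is True, the per-step outputs are
    flattened before being returned."""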
| |
| def __init__(self, hidden_size, seq_length, batch_size, bidirectional, |
| num_layers, return_sequences, rnn_mode, batch_first): |
| super(LSTMModel, self).__init__() |
| self.hidden_size = hidden_size |
| self.seq_length = seq_length |
| self.return_sequences = return_sequences |
| |
| self.lstm = layer.CudnnRNN(hidden_size=hidden_size, |
| num_layers=num_layers, |
| bidirectional=bidirectional, |
| return_sequences=return_sequences, |
| rnn_mode=rnn_mode, |
| batch_first=batch_first) |
| self.optimizer = opt.SGD(0.1) |
| |
| def forward(self, x): |
| y = self.lstm(x) |
| if self.return_sequences: |
| y = autograd.reshape(y, (-1, self.seq_length * self.hidden_size)) |
| return y |
| |
| |
| class TestModelMethods(unittest.TestCase): |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_rnn_with_seq_lengths(self, dev=gpu_dev): |
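        """Train on padded sequences where seq_lengths holds each sample's
        true length, so the masked LSTM ignores the padded steps."""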
| bs = 2 |
| seq_length = 3 |
| hidden_size = 2 |
| em_size = 2 |
| x_np = np.array([[[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], |
| [[0.3, 0.3], [0.4, 0.4], [0.0, |
| 0.0]]]).astype(np.float32) |
| y_np = np.array([[0.4, 0.4], [0.5, 0.5]]).astype(np.float32) |
| seq_lengths_np = np.array([3, 2]).astype(np.int32) |
| |
| x = tensor.from_numpy(x_np) |
| x.to_device(dev) |
| y = tensor.from_numpy(y_np) |
| y.to_device(dev) |
| seq_lengths = tensor.from_numpy(seq_lengths_np) |
| |
| m = LSTMModel3(hidden_size) |
| m.compile([x, seq_lengths], |
| is_train=True, |
| use_graph=False, |
| sequential=False) |
| m.train() |
| for i in range(10): |
| out = m.forward(x, seq_lengths) |
| loss = autograd.mse_loss(out, y) |
| print("train l:", tensor.to_numpy(loss)) |
| m.optimizer(loss) |
| m.eval() |
| out = m.forward(x, seq_lengths) |
| loss = autograd.mse_loss(out, y) |
| print(" eval l:", tensor.to_numpy(loss)) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_lstm_model(self, dev=gpu_dev): |
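        """Fit a two-layer LSTM on a small fixed batch for 1000 steps,
        asserting that the output shape matches the target's."""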
| hidden_size = 3 |
| seq_length = 2 |
| batch_size = 4 |
| feature_size = 3 |
| bidirectional = False |
| directions = 2 if bidirectional else 1 |
| num_layers = 2 |
| out_size = hidden_size |
| return_sequences = False |
| batch_first = True |
| rnn_mode = "lstm" |
| |
| # manual test case |
        x_data = np.array([[[0, 0, 1], [0, 1, 0]], [[0, 1, 0], [1, 0, 0]],
                           [[0, 0, 1], [0, 1, 0]], [[1, 0, 0], [0, 0, 1]]],
                          dtype=np.float32).reshape(batch_size, seq_length,
                                                    feature_size)  # bs, seq, fea
        if return_sequences:
            y_data = np.array(
                [[[0, 1, 0], [1, 0, 0]], [[1, 0, 0], [0, 0, 1]],
                 [[0, 1, 0], [1, 0, 0]], [[0, 0, 1], [0, 1, 0]]],
                dtype=np.float32).reshape(batch_size, seq_length,
                                          hidden_size)  # bs, seq, hidden
            # np.reshape returns a new array, so the result must be assigned
            y_data = y_data.reshape(batch_size, -1)
| else: |
| y_data = np.array([[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]], |
| dtype=np.float32).reshape( |
| batch_size, hidden_size) # bs, hidden |
| |
| x = tensor.Tensor(device=dev, data=x_data) |
| y_t = tensor.Tensor(device=dev, data=y_data) |
| |
| m = LSTMModel(hidden_size, seq_length, batch_size, bidirectional, |
| num_layers, return_sequences, rnn_mode, batch_first) |
| m.compile([x], is_train=True, use_graph=False, sequential=False) |
| |
| m.train() |
| for i in range(1000): |
| y = m.forward(x) |
| assert y.shape == y_t.shape |
| loss = autograd.softmax_cross_entropy(y, y_t) |
            if i % 100 == 0:
                print("train loss:", tensor.to_numpy(loss))
| m.optimizer(loss) |
| |
| m.eval() |
| y = m.forward(x) |
| loss = autograd.softmax_cross_entropy(y, y_t) |
| print("eval loss", loss) |
| |
| |
| class TestModelSaveMethods(unittest.TestCase): |
| |
    def _save_states_load_states_helper(self, dev, graph_flag=False):
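        """Set every model state to a known value, save the states and some
        auxiliary tensors to a zip file, train for a few batches so that the
        states change, then reload the file and verify that all states and
        auxiliary tensors are restored."""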
| x_shape = (2, 2, 2, 2) |
| x = tensor.PlaceHolder(x_shape, device=dev) |
| |
| m = MyModel() |
| m.compile([x], is_train=True, use_graph=graph_flag, sequential=False) |
| |
| states = { |
| "conv1.W": |
| tensor.Tensor((2, 2, 2, 2), device=dev).set_value(0.1), |
| "conv1.b": |
| tensor.Tensor((2,), device=dev).set_value(0.2), |
| "bn1.scale": |
| tensor.Tensor((2,), device=dev).set_value(0.3), |
| "bn1.bias": |
| tensor.Tensor((2,), device=dev).set_value(0.4), |
| "bn1.running_mean": |
| tensor.Tensor((2,), device=dev).set_value(0.5), |
| "bn1.running_var": |
| tensor.Tensor((2,), device=dev).set_value(0.6), |
| "doublelinear1.l1.W": |
| tensor.Tensor((2, 4), device=dev).set_value(0.7), |
| "doublelinear1.l1.b": |
| tensor.Tensor((4,), device=dev).set_value(0.8), |
| "doublelinear1.l2.W": |
| tensor.Tensor((4, 2), device=dev).set_value(0.9), |
| "doublelinear1.l2.b": |
| tensor.Tensor((2,), device=dev).set_value(1.0) |
| } |
| |
| m.set_states(states) |
| states2 = m.get_states() |
| for k in states2.keys(): |
| np.testing.assert_array_almost_equal(tensor.to_numpy(states[k]), |
| tensor.to_numpy(states2[k])) |
| |
| opt_state1 = tensor.Tensor((2, 10), device=dev).gaussian(1, 0.1) |
| opt_state2 = tensor.Tensor((20, 2), device=dev).gaussian(0.1, 1) |
| aux = {"opt1": opt_state1, "opt2": opt_state2} |
| |
| # save snapshot1 |
| zip_fp = 'snapshot1_%s.zip' % self._testMethodName |
| if os.path.exists(zip_fp): |
| os.remove(zip_fp) |
| m.save_states(zip_fp, aux) |
| |
        # do some training so that the states change
        cx = tensor.Tensor(x_shape, device=dev).gaussian(1, 1)
        cy = tensor.Tensor((2, 2), device=dev).gaussian(1, 1)
        num_batches = 10
        for i in range(num_batches):
            m.train_one_batch(cx, cy)
| |
| # restore snapshot |
| aux2 = m.load_states(zip_fp) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(aux2["opt1"]), |
| tensor.to_numpy(aux["opt1"])) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(aux2["opt2"]), |
| tensor.to_numpy(aux["opt2"])) |
| |
        # the restored model states should match the values in the snapshot
| states3 = m.get_states() |
| for k in states3.keys(): |
| np.testing.assert_array_almost_equal(tensor.to_numpy(states[k]), |
| tensor.to_numpy(states3[k])) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_save_states_load_states_gpu(self): |
| self._save_states_load_states_helper(gpu_dev, graph_flag=False) |
| self._save_states_load_states_helper(gpu_dev, graph_flag=True) |
| |
| def test_save_states_load_states_cpu(self): |
| self._save_states_load_states_helper(cpu_dev, graph_flag=False) |
| self._save_states_load_states_helper(cpu_dev, graph_flag=True) |
| |
| |
| class TestPythonModule(unittest.TestCase): |
| |
| def to_categorical(self, y, num_classes): |
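        """Convert a vector of class indices into a one-hot matrix."""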
| y = np.array(y, dtype="int") |
| n = y.shape[0] |
| categorical = np.zeros((n, num_classes)) |
| categorical[np.arange(n), y] = 1 |
| return categorical |
| |
| def generate_data(self, dev, num=400): |
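        """Sample 2-D points around the line y = 5x + 1; each label records
        whether the point lies below the line (one-hot encoded)."""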
| f = lambda x: (5 * x + 1) |
| |
| x = np.random.uniform(-1, 1, num) |
| y = f(x) + 2 * np.random.randn(len(x)) |
| |
| self.label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)]) |
| self.data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32) |
| self.label = self.to_categorical(self.label, 2).astype(np.float32) |
| |
| self.inputs = Tensor(data=self.data, device=dev) |
| self.target = Tensor(data=self.label, device=dev) |
| |
| def get_params(self, model): |
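        """Fetch the model parameters and keep numpy copies of them for the
        numpy reference implementation."""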
| params = model.get_params() |
| self.w0 = params['linear1.W'] |
| self.b0 = params['linear1.b'] |
| self.w1 = params['linear2.W'] |
| self.b1 = params['linear2.b'] |
| |
| self.W0 = tensor.to_numpy(self.w0) |
| self.B0 = tensor.to_numpy(self.b0) |
| self.W1 = tensor.to_numpy(self.w1) |
| self.B1 = tensor.to_numpy(self.b1) |
| |
| def numpy_forward(self, inputs): |
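        """Numpy reference of the MLP forward pass: linear, ReLU, linear."""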
| self.x1 = np.matmul(inputs, self.W0) |
| self.x2 = np.add(self.x1, self.B0) |
| self.x3 = np.maximum(self.x2, 0) |
| self.x4 = np.matmul(self.x3, self.W1) |
| self.x5 = np.add(self.x4, self.B1) |
| return self.x5 |
| |
| def numpy_train_one_batch(self, inputs, y): |
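        """Numpy reference of one training step: forward pass, softmax cross
        entropy, manual backpropagation, and an SGD update with lr=0.05 (the
        same rate as self.sgd)."""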
| # forward propagation |
| out = self.numpy_forward(inputs) |
| |
| # softmax cross entropy loss |
| exp_out = np.exp(out - np.max(out, axis=-1, keepdims=True)) |
| self.softmax = exp_out / np.sum(exp_out, axis=-1, keepdims=True) |
| loss = np.sum(y * np.log(self.softmax)) / -self.softmax.shape[0] |
| |
| # optimize |
| # calculate gradients |
| label_sum = np.sum(self.label, axis=-1) |
| dloss = self.softmax - self.label / label_sum.reshape( |
| label_sum.shape[0], 1) |
| dloss /= self.softmax.shape[0] |
| |
| dx5 = dloss |
| db1 = np.sum(dloss, 0) |
| |
| dx4 = np.matmul(dx5, self.W1.T) |
| dw1 = np.matmul(self.x3.T, dx5) |
| |
| dx3 = dx4 * (self.x3 > 0) |
| |
| dx2 = dx3 |
| db0 = np.sum(dx3, 0) |
| |
| dx1 = np.matmul(dx2, self.W0.T) |
| dw0 = np.matmul(self.data.T, dx2) |
| |
| # update all the params |
| self.W0 -= 0.05 * dw0 |
| self.B0 -= 0.05 * db0 |
| self.W1 -= 0.05 * dw1 |
| self.B1 -= 0.05 * db1 |
| return out, loss |
| |
| def setUp(self): |
| self.sgd = opt.SGD(lr=0.05) |
| |
| cpu_dev.ResetGraph() |
| if singa_api.USE_CUDA: |
| gpu_dev.ResetGraph() |
| |
| def tearDown(self): |
| cpu_dev.ResetGraph() |
| if singa_api.USE_CUDA: |
| gpu_dev.ResetGraph() |
| |
| def _forward_helper(self, dev, is_train, use_graph, sequential): |
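        """Run one forward pass through the compiled MLP and compare the
        output with the numpy reference."""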
| self.generate_data(dev) |
        model = MLP()  # no optimizer is needed for a forward-only test
| model.compile([self.inputs], |
| is_train=is_train, |
| use_graph=use_graph, |
| sequential=sequential) |
| |
| self.get_params(model) |
| |
| out = model(self.inputs) |
| np_out = self.numpy_forward(self.data) |
| |
| np.testing.assert_array_almost_equal(tensor.to_numpy(out), np_out) |
| |
| def _train_one_batch_helper(self, dev, is_train, use_graph, sequential): |
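        """Run one training step on the SINGA model and on the numpy
        reference, then compare the outputs, the loss, and the updated
        parameters."""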
| self.generate_data(dev) |
| model = MLP(num_classes=2) |
| model.set_optimizer(self.sgd) |
| model.compile([self.inputs], |
| is_train=is_train, |
| use_graph=use_graph, |
| sequential=sequential) |
| |
| self.get_params(model) |
| |
| out, loss = model(self.inputs, self.target) |
| np_out, np_loss = self.numpy_train_one_batch(self.data, self.label) |
| |
| np.testing.assert_array_almost_equal(tensor.to_numpy(out), np_out) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(loss), np_loss) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(self.w0), self.W0) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(self.b0), self.B0) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(self.w1), self.W1) |
| np.testing.assert_array_almost_equal(tensor.to_numpy(self.b1), self.B1) |
| |
| def test_forward_cpu(self): |
| self._forward_helper(cpu_dev, False, True, False) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_forward_gpu(self): |
| self._forward_helper(gpu_dev, False, True, False) |
| |
| def test_evaluate_cpu(self): |
| self._forward_helper(cpu_dev, False, False, False) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_evaluate_gpu(self): |
| self._forward_helper(gpu_dev, False, False, False) |
| |
| def test_train_one_batch_cpu(self): |
| self._train_one_batch_helper(cpu_dev, True, True, False) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_train_one_batch_gpu(self): |
| self._train_one_batch_helper(gpu_dev, True, True, False) |
| |
| def test_without_graph_cpu(self): |
| self._train_one_batch_helper(cpu_dev, True, False, False) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_without_graph_gpu(self): |
| self._train_one_batch_helper(gpu_dev, True, False, False) |
| |
| def test_run_in_serial_cpu(self): |
| self._train_one_batch_helper(cpu_dev, True, True, True) |
| |
| @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled') |
| def test_run_in_serial_gpu(self): |
| self._train_one_batch_helper(gpu_dev, True, True, True) |
| |
| |
| if __name__ == '__main__': |
| unittest.main() |