"""A simple demo of new RNN cell with PTB language model."""
################################################################################
# Speed test (time major is 1.5~2 times faster than batch major).
#
# -- This script (time major) -----
# 2016-10-10 18:43:21,890 Epoch[0] Batch [50] Speed: 1717.76 samples/sec Train-Perplexity=4311.345018
# 2016-10-10 18:43:25,959 Epoch[0] Batch [100] Speed: 1573.17 samples/sec Train-Perplexity=844.092421
# 2016-10-10 18:43:29,807 Epoch[0] Batch [150] Speed: 1663.17 samples/sec Train-Perplexity=498.080716
# 2016-10-10 18:43:33,871 Epoch[0] Batch [200] Speed: 1574.84 samples/sec Train-Perplexity=455.051252
# 2016-10-10 18:43:37,720 Epoch[0] Batch [250] Speed: 1662.87 samples/sec Train-Perplexity=410.500066
# 2016-10-10 18:43:40,766 Epoch[0] Batch [300] Speed: 2100.81 samples/sec Train-Perplexity=274.317460
# 2016-10-10 18:43:44,571 Epoch[0] Batch [350] Speed: 1682.45 samples/sec Train-Perplexity=350.132577
# 2016-10-10 18:43:48,377 Epoch[0] Batch [400] Speed: 1681.41 samples/sec Train-Perplexity=320.674884
# 2016-10-10 18:43:51,253 Epoch[0] Train-Perplexity=336.210212
# 2016-10-10 18:43:51,253 Epoch[0] Time cost=33.529
# 2016-10-10 18:43:53,373 Epoch[0] Validation-Perplexity=282.453883
#
# -- ../rnn/rnn_cell_demo.py (batch major) -----
# 2016-10-10 18:44:34,133 Epoch[0] Batch [50] Speed: 1004.50 samples/sec Train-Perplexity=4398.428571
# 2016-10-10 18:44:39,874 Epoch[0] Batch [100] Speed: 1114.85 samples/sec Train-Perplexity=771.401960
# 2016-10-10 18:44:45,528 Epoch[0] Batch [150] Speed: 1132.03 samples/sec Train-Perplexity=525.207444
# 2016-10-10 18:44:51,564 Epoch[0] Batch [200] Speed: 1060.37 samples/sec Train-Perplexity=453.741140
# 2016-10-10 18:44:57,865 Epoch[0] Batch [250] Speed: 1015.78 samples/sec Train-Perplexity=411.914237
# 2016-10-10 18:45:04,032 Epoch[0] Batch [300] Speed: 1037.92 samples/sec Train-Perplexity=381.302188
# 2016-10-10 18:45:10,153 Epoch[0] Batch [350] Speed: 1045.49 samples/sec Train-Perplexity=363.326871
# 2016-10-10 18:45:16,062 Epoch[0] Batch [400] Speed: 1083.21 samples/sec Train-Perplexity=377.929014
# 2016-10-10 18:45:19,993 Epoch[0] Train-Perplexity=294.675899
# 2016-10-10 18:45:19,993 Epoch[0] Time cost=52.604
# 2016-10-10 18:45:21,401 Epoch[0] Validation-Perplexity=294.345659
################################################################################
import os
import numpy as np
import mxnet as mx
from bucket_io import BucketSentenceIter, default_build_vocab
data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))


def Perplexity(label, pred):
    """Compute perplexity: exp of the mean negative log-likelihood."""
    # collapse the time and batch dimensions
    label = label.reshape((-1,))
    pred = pred.reshape((-1, pred.shape[-1]))
    loss = 0.
    for i in range(pred.shape[0]):
        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
    return np.exp(loss / label.size)
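
# Illustrative sketch (not part of the demo): with two tokens whose predicted
# probabilities for their correct classes are 0.5 and 0.25, the summed loss is
# -(log(0.5) + log(0.25)) and the perplexity is exp(loss / 2) ~= 2.83:
#   Perplexity(np.array([0, 1]),
#              np.array([[0.5, 0.5], [0.75, 0.25]]))  # -> ~2.83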

if __name__ == '__main__':
    batch_size = 128
    buckets = [10, 20, 30, 40, 50, 60]
    num_hidden = 200
    num_embed = 200
    num_lstm_layer = 2

    num_epoch = 2
    learning_rate = 0.01
    momentum = 0.0

    contexts = [mx.context.gpu(i) for i in range(1)]
    vocab = default_build_vocab(os.path.join(data_dir, 'ptb.train.txt'))

    init_h = [mx.io.DataDesc('LSTM_state', (num_lstm_layer, batch_size, num_hidden),
                             layout='TNC')]
    init_c = [mx.io.DataDesc('LSTM_state_cell', (num_lstm_layer, batch_size, num_hidden),
                             layout='TNC')]
    init_states = init_c + init_h
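    # Note: these initial-state names must match the RNN cell name used in
    # sym_gen ('LSTM'): the cell's state inputs are auto-named 'LSTM_state'
    # and (for LSTM mode) 'LSTM_state_cell', each of shape
    # (num_layers, batch, num_hidden); the same names appear in data_names below.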
    data_train = BucketSentenceIter(os.path.join(data_dir, 'ptb.train.txt'),
                                    vocab, buckets, batch_size, init_states,
                                    time_major=True)
    data_val = BucketSentenceIter(os.path.join(data_dir, 'ptb.valid.txt'),
                                  vocab, buckets, batch_size, init_states,
                                  time_major=True)
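    # With time_major=True each batch is laid out as (seq_len, batch_size), so
    # after embedding the RNN cell sees its expected (time, batch, feature)
    # input; this layout is what the speed comparison at the top measures.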

    def sym_gen(seq_len):
        data = mx.sym.Variable('data')
        label = mx.sym.Variable('softmax_label')
        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
                                 output_dim=num_embed, name='embed')

        # TODO(tofix)
        # Currently all the LSTM parameters are concatenated into one huge
        # vector named '<name>_parameters'. By default the mxnet initializer
        # does not know how to initialize it, because its name does not end
        # with _weight, _bias, or anything familiar. As a temporary workaround
        # we create the variable ourselves and name it LSTM_bias to get this
        # demo running. Note that a bias is initialized to zeros by default,
        # so this is not a good scheme; but calling it LSTM_weight is not good
        # either, since it is a 1D vector while the initialization scheme of a
        # weight parameter needs at least two dimensions.
        rnn_params = mx.sym.Variable('LSTM_bias')

        # The RNN cell takes input of shape (time, batch, feature).
        rnn = mx.sym.RNN(data=embed, state_size=num_hidden,
                         num_layers=num_lstm_layer, mode='lstm',
                         name='LSTM',
                         # The following param can be omitted provided we do
                         # not need to apply the workaround mentioned above.
                         parameters=rnn_params)

        # The RNN cell output is of shape (time, batch, dim). If we need the
        # hidden states and cell states of the last time step (e.g. when
        # building encoder-decoder models), we can set state_outputs=True and
        # the RNN cell will have extra outputs: rnn['LSTM_output'],
        # rnn['LSTM_state'] and, for LSTM, also rnn['LSTM_state_cell'].
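        # Illustrative sketch (not used in this demo) of requesting the final
        # states from the cell:
        #   rnn = mx.sym.RNN(data=embed, state_size=num_hidden,
        #                    num_layers=num_lstm_layer, mode='lstm',
        #                    state_outputs=True, name='LSTM')
        #   last_h = rnn['LSTM_state']       # (num_layers, batch, num_hidden)
        #   last_c = rnn['LSTM_state_cell']  # LSTM mode only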

        # Now we collapse the time and batch dimensions to do the final
        # linear logistic regression prediction.
        hidden = mx.sym.Reshape(data=rnn, shape=(-1, num_hidden))
        pred = mx.sym.FullyConnected(data=hidden, num_hidden=len(vocab),
                                     name='pred')

        # Reshape back into a shape compatible with the labels.
        pred_tm = mx.sym.Reshape(data=pred, shape=(seq_len, -1, len(vocab)))
        sm = mx.sym.SoftmaxOutput(data=pred_tm, label=label,
                                  preserve_shape=True, name='softmax')
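        # Shape flow (T = seq_len, N = batch size, V = len(vocab)):
        #   embed (T, N, num_embed) -> rnn (T, N, num_hidden)
        #   -> hidden (T*N, num_hidden) -> pred (T*N, V) -> pred_tm (T, N, V);
        # preserve_shape=True keeps that (T, N, V) layout through the softmax.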
        data_names = ['data', 'LSTM_state', 'LSTM_state_cell']
        label_names = ['softmax_label']

        return (sm, data_names, label_names)
    if len(buckets) == 1:
        mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)
    else:
        mod = mx.mod.BucketingModule(sym_gen,
                                     default_bucket_key=data_train.default_bucket_key,
                                     context=contexts)
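    # BucketingModule calls sym_gen(bucket_key) for every bucket length seen in
    # the data and shares the parameters across the resulting symbols.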

    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch,
            eval_metric=mx.metric.np(Perplexity),
            batch_end_callback=mx.callback.Speedometer(batch_size, 50),
            initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
            optimizer='sgd',
            optimizer_params={'learning_rate': learning_rate,
                              'momentum': momentum, 'wd': 0.00001})
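    # A minimal follow-up sketch (assuming the standard Module API): the
    # validation perplexity can be recomputed after training with
    #   res = mod.score(data_val, mx.metric.np(Perplexity))
    #   print(res)  # e.g. [('Perplexity', ...)]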