from __future__ import print_function
import mxnet as mx
import mxnet.ndarray as nd
import numpy
import logging
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import argparse
import time
from algos import *
from data_loader import *
from utils import *
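
# Custom output ops implemented with the legacy mx.operator.NumpyOp API.
# CrossEntropySoftmax outputs class probabilities and back-propagates (softmax - label);
# LogSoftmax outputs log-probabilities and back-propagates (exp(output) - label). LogSoftmax is
# used below as the output op of the distilled student network on MNIST.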
class CrossEntropySoftmax(mx.operator.NumpyOp):
def __init__(self):
super(CrossEntropySoftmax, self).__init__(False)
def list_arguments(self):
return ['data', 'label']
def list_outputs(self):
return ['output']
def infer_shape(self, in_shape):
data_shape = in_shape[0]
label_shape = in_shape[0]
output_shape = in_shape[0]
return [data_shape, label_shape], [output_shape]
def forward(self, in_data, out_data):
x = in_data[0]
y = out_data[0]
y[:] = numpy.exp(x - x.max(axis=1).reshape((x.shape[0], 1))).astype('float32')
y /= y.sum(axis=1).reshape((x.shape[0], 1))
def backward(self, out_grad, in_data, out_data, in_grad):
l = in_data[1]
y = out_data[0]
dx = in_grad[0]
dx[:] = (y - l)
class LogSoftmax(mx.operator.NumpyOp):
def __init__(self):
super(LogSoftmax, self).__init__(False)
def list_arguments(self):
return ['data', 'label']
def list_outputs(self):
return ['output']
def infer_shape(self, in_shape):
data_shape = in_shape[0]
label_shape = in_shape[0]
output_shape = in_shape[0]
return [data_shape, label_shape], [output_shape]
def forward(self, in_data, out_data):
x = in_data[0]
y = out_data[0]
y[:] = (x - x.max(axis=1, keepdims=True)).astype('float32')
y -= numpy.log(numpy.exp(y).sum(axis=1, keepdims=True)).astype('float32')
# y[:] = numpy.exp(x - x.max(axis=1).reshape((x.shape[0], 1)))
# y /= y.sum(axis=1).reshape((x.shape[0], 1))
def backward(self, out_grad, in_data, out_data, in_grad):
l = in_data[1]
y = out_data[0]
dx = in_grad[0]
dx[:] = (numpy.exp(y) - l).astype('float32')
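
# Gradient functions used to distill the SGLD teacher's predictions into the student network.
# Classification: the gradient passed back to the student is (student prediction - teacher prediction).
# Regression: the student predicts a mean and a log-variance; the gradients below are those of the
# expected Gaussian negative log-likelihood, averaged over the teacher's predictive distribution
# N(teacher_pred, 1 / teacher_noise_precision).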
def classification_student_grad(student_outputs, teacher_pred):
return [student_outputs[0] - teacher_pred]
def regression_student_grad(student_outputs, teacher_pred, teacher_noise_precision):
student_mean = student_outputs[0]
student_var = student_outputs[1]
grad_mean = nd.exp(-student_var) * (student_mean - teacher_pred)
grad_var = (1 - nd.exp(-student_var) * (nd.square(student_mean - teacher_pred)
+ 1.0 / teacher_noise_precision)) / 2
return [grad_mean, grad_var]
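
# MLP for MNIST: two ReLU hidden layers of num_hidden units and a 10-way output layer,
# finished with SoftmaxOutput unless a custom output op (e.g. LogSoftmax) is supplied.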
def get_mnist_sym(output_op=None, num_hidden=400):
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='mnist_fc1', num_hidden=num_hidden)
net = mx.symbol.Activation(data=net, name='mnist_relu1', act_type="relu")
net = mx.symbol.FullyConnected(data=net, name='mnist_fc2', num_hidden=num_hidden)
net = mx.symbol.Activation(data=net, name='mnist_relu2', act_type="relu")
net = mx.symbol.FullyConnected(data=net, name='mnist_fc3', num_hidden=10)
if output_op is None:
net = mx.symbol.SoftmaxOutput(data=net, name='softmax')
else:
net = output_op(data=net, name='softmax')
return net
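
# Stochastic gradient of the negative log-posterior for the two-parameter Gaussian-mixture
# model used in the SGLD paper's synthetic experiment:
#   theta1 ~ N(0, sigma1^2),  theta2 ~ N(0, sigma2^2),
#   x ~ 0.5 * N(theta1, sigmax^2) + 0.5 * N(theta1 + theta2, sigmax^2).
# rescale_grad (dataset size / minibatch size) scales the minibatch likelihood term up to the
# full dataset, as required for the SGLD update.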
def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None):
if grad is None:
grad = nd.empty(theta.shape, theta.context)
theta1 = theta.asnumpy()[0]
theta2 = theta.asnumpy()[1]
v1 = sigma1 ** 2
v2 = sigma2 ** 2
vx = sigmax ** 2
denominator = numpy.exp(-(X - theta1) ** 2 / (2 * vx)) + numpy.exp(
-(X - theta1 - theta2) ** 2 / (2 * vx))
grad_npy = numpy.zeros(theta.shape)
grad_npy[0] = -rescale_grad * ((numpy.exp(-(X - theta1) ** 2 / (2 * vx)) * (X - theta1) / vx
+ numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * (
X - theta1 - theta2) / vx) / denominator).sum() \
+ theta1 / v1
grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * (
X - theta1 - theta2) / vx) / denominator).sum() \
+ theta2 / v2
grad[:] = grad_npy
return grad
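
# Symbols for the 1-D toy regression problem. The teacher is a one-hidden-layer regressor whose
# LinearRegressionOutput grad_scale encodes the assumed observation noise precision; the student
# has two output heads (predictive mean and log-variance) grouped into a single symbol.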
def get_toy_sym(teacher=True, teacher_noise_precision=None):
if teacher:
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='teacher_fc1', num_hidden=100)
net = mx.symbol.Activation(data=net, name='teacher_relu1', act_type="relu")
net = mx.symbol.FullyConnected(data=net, name='teacher_fc2', num_hidden=1)
net = mx.symbol.LinearRegressionOutput(data=net, name='teacher_output',
grad_scale=teacher_noise_precision)
else:
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='student_fc1', num_hidden=100)
net = mx.symbol.Activation(data=net, name='student_relu1', act_type="relu")
student_mean = mx.symbol.FullyConnected(data=net, name='student_mean', num_hidden=1)
student_var = mx.symbol.FullyConnected(data=net, name='student_var', num_hidden=1)
net = mx.symbol.Group([student_mean, student_var])
return net
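
# Default device for all experiments below; switch to mx.cpu() if no GPU is available.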
def dev():
return mx.gpu()
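
# Baseline: train the MNIST MLP with plain SGD (a single point estimate, no posterior sampling).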
def run_mnist_SGD(training_num=50000):
X, Y, X_test, Y_test = load_mnist(training_num)
minibatch_size = 100
net = get_mnist_sym()
data_shape = (minibatch_size,) + X.shape[1::]
data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
exe, exe_params, _ = SGD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
X_test=X_test, Y_test=Y_test,
total_iter_num=1000000,
initializer=initializer,
lr=5E-6, prior_precision=1.0, minibatch_size=100)
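
# SGLD on MNIST: after burn-in, weight samples are kept every thin_interval iterations.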
def run_mnist_SGLD(training_num=50000):
X, Y, X_test, Y_test = load_mnist(training_num)
minibatch_size = 100
net = get_mnist_sym()
data_shape = (minibatch_size,) + X.shape[1::]
data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
exe, sample_pool = SGLD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
X_test=X_test, Y_test=Y_test,
total_iter_num=1000000,
initializer=initializer,
learning_rate=4E-6, prior_precision=1.0, minibatch_size=100,
thin_interval=100, burn_in_iter_num=1000)
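
# Bayesian Dark Knowledge on MNIST: an SGLD teacher is distilled online into a single student MLP
# trained on the teacher's log-probabilities. The wider 800-unit configuration is used when at
# least 10,000 training examples are available.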
def run_mnist_DistilledSGLD(training_num=50000):
X, Y, X_test, Y_test = load_mnist(training_num)
minibatch_size = 100
if training_num >= 10000:
num_hidden = 800
total_iter_num = 1000000
teacher_learning_rate = 1E-6
student_learning_rate = 0.0001
teacher_prior = 1
student_prior = 0.1
perturb_deviation = 0.1
else:
num_hidden = 400
total_iter_num = 20000
teacher_learning_rate = 4E-5
student_learning_rate = 0.0001
teacher_prior = 1
student_prior = 0.1
perturb_deviation = 0.001
teacher_net = get_mnist_sym(num_hidden=num_hidden)
logsoftmax = LogSoftmax()
student_net = get_mnist_sym(output_op=logsoftmax, num_hidden=num_hidden)
data_shape = (minibatch_size,) + X.shape[1::]
teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
teacher_initializer = BiasXavier(factor_type="in", magnitude=1)
student_initializer = BiasXavier(factor_type="in", magnitude=1)
student_exe, student_params, _ = \
DistilledSGLD(teacher_sym=teacher_net, student_sym=student_net,
teacher_data_inputs=teacher_data_inputs,
student_data_inputs=student_data_inputs,
X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=total_iter_num,
student_initializer=student_initializer,
teacher_initializer=teacher_initializer,
student_optimizing_algorithm="adam",
teacher_learning_rate=teacher_learning_rate,
student_learning_rate=student_learning_rate,
teacher_prior_precision=teacher_prior, student_prior_precision=student_prior,
perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev())
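
# SGLD posterior sampling for the toy regression teacher network.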
def run_toy_SGLD():
X, Y, X_test, Y_test = load_toy()
minibatch_size = 1
teacher_noise_precision = 1.0 / 9.0
net = get_toy_sym(True, teacher_noise_precision)
data_shape = (minibatch_size,) + X.shape[1::]
data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
initializer = mx.init.Uniform(0.07)
exe, params, _ = \
SGLD(sym=net, data_inputs=data_inputs,
X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=50000,
initializer=initializer,
learning_rate=1E-4,
# lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
prior_precision=0.1,
burn_in_iter_num=1000,
thin_interval=10,
task='regression',
minibatch_size=minibatch_size, dev=dev())
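
# Distilled SGLD on the toy regression problem: the student learns a Gaussian predictive
# distribution (mean and log-variance) matched to the teacher via regression_student_grad above.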
def run_toy_DistilledSGLD():
X, Y, X_test, Y_test = load_toy()
minibatch_size = 1
teacher_noise_precision = 1.0
teacher_net = get_toy_sym(True, teacher_noise_precision)
student_net = get_toy_sym(False)
data_shape = (minibatch_size,) + X.shape[1::]
teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev())}
# 'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
teacher_initializer = mx.init.Uniform(0.07)
student_initializer = mx.init.Uniform(0.07)
student_grad_f = lambda student_outputs, teacher_pred: \
regression_student_grad(student_outputs, teacher_pred, teacher_noise_precision)
student_exe, student_params, _ = \
DistilledSGLD(teacher_sym=teacher_net, student_sym=student_net,
teacher_data_inputs=teacher_data_inputs,
student_data_inputs=student_data_inputs,
X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=80000,
teacher_initializer=teacher_initializer,
student_initializer=student_initializer,
teacher_learning_rate=1E-4, student_learning_rate=0.01,
# teacher_lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
student_lr_scheduler=mx.lr_scheduler.FactorScheduler(8000, 0.8),
student_grad_f=student_grad_f,
teacher_prior_precision=0.1, student_prior_precision=0.001,
perturb_deviation=0.1, minibatch_size=minibatch_size, task='regression',
dev=dev())
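
# Full-batch Hamiltonian Monte Carlo on the toy regression problem (minibatch_size equals the
# training-set size); L presumably sets the number of leapfrog steps per proposal.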
def run_toy_HMC():
X, Y, X_test, Y_test = load_toy()
minibatch_size = Y.shape[0]
noise_precision = 1 / 9.0
net = get_toy_sym(True, noise_precision)
data_shape = (minibatch_size,) + X.shape[1::]
data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
initializer = mx.init.Uniform(0.07)
sample_pool = HMC(net, data_inputs=data_inputs, X=X, Y=Y, X_test=X_test, Y_test=Y_test,
sample_num=300000, initializer=initializer, prior_precision=1.0,
learning_rate=1E-3, L=10, dev=dev())
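
# Synthetic Gaussian-mixture experiment from the SGLD paper: theta is updated with the built-in
# 'sgld' optimizer under a polynomially decaying step size (SGLDScheduler), and the collected
# samples are plotted as a 2-D histogram that should recover the bimodal posterior over
# (theta1, theta2).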
def run_synthetic_SGLD():
theta1 = 0
theta2 = 1
sigma1 = numpy.sqrt(10)
sigma2 = 1
sigmax = numpy.sqrt(2)
X = load_synthetic(theta1=theta1, theta2=theta2, sigmax=sigmax, num=100)
minibatch_size = 1
total_iter_num = 1000000
lr_scheduler = SGLDScheduler(begin_rate=0.01, end_rate=0.0001, total_iter_num=total_iter_num,
factor=0.55)
optimizer = mx.optimizer.create('sgld',
learning_rate=None,
rescale_grad=1.0,
lr_scheduler=lr_scheduler,
wd=0)
updater = mx.optimizer.get_updater(optimizer)
theta = mx.random.normal(0, 1, (2,), mx.cpu())
grad = nd.empty((2,), mx.cpu())
samples = numpy.zeros((2, total_iter_num))
start = time.time()
    for i in range(total_iter_num):
if (i + 1) % 100000 == 0:
end = time.time()
print("Iter:%d, Time spent: %f" % (i + 1, end - start))
start = time.time()
ind = numpy.random.randint(0, X.shape[0])
        synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax,
                       rescale_grad=X.shape[0] / float(minibatch_size), grad=grad)
updater('theta', grad, theta)
samples[:, i] = theta.asnumpy()
plt.hist2d(samples[0, :], samples[1, :], (200, 200), cmap=plt.cm.jet)
plt.colorbar()
plt.show()
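
# Command-line entry point. Example invocations (script filename is illustrative):
#   python bdk_demo.py -d 1 -l 2 -t 50000   # DistilledSGLD on MNIST
#   python bdk_demo.py -d 0 -l 1            # SGLD on the toy regression problem
#   python bdk_demo.py -d 2                 # synthetic-data SGLD experiment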
if __name__ == '__main__':
numpy.random.seed(100)
mx.random.seed(100)
    parser = argparse.ArgumentParser(
        description="Examples from the papers [NIPS 2015] Bayesian Dark Knowledge and "
                    "[ICML 2011] Bayesian Learning via Stochastic Gradient Langevin Dynamics")
parser.add_argument("-d", "--dataset", type=int, default=1,
help="Dataset to use. 0 --> TOY, 1 --> MNIST, 2 --> Synthetic Data in "
"the SGLD paper")
parser.add_argument("-l", "--algorithm", type=int, default=2,
help="Type of algorithm to use. 0 --> SGD, 1 --> SGLD, other-->DistilledSGLD")
parser.add_argument("-t", "--training", type=int, default=50000,
help="Number of training samples")
args = parser.parse_args()
training_num = args.training
if args.dataset == 1:
if 0 == args.algorithm:
run_mnist_SGD(training_num)
elif 1 == args.algorithm:
run_mnist_SGLD(training_num)
else:
run_mnist_DistilledSGLD(training_num)
elif args.dataset == 0:
if 1 == args.algorithm:
run_toy_SGLD()
elif 2 == args.algorithm:
run_toy_DistilledSGLD()
elif 3 == args.algorithm:
run_toy_HMC()
else:
run_synthetic_SGLD()