example/reinforcement-learning/ddpg/qfuncs.py - mxnet-test - Git at Google

 from utils import define_qfunc
 import mxnet as mx


 class QFunc(object):
     """
     Base class for Q-Value Function.
     """

     def __init__(self, env_spec):

         self.env_spec = env_spec

     def get_qvals(self, obs, act):

         raise NotImplementedError


 class ContinuousMLPQ(QFunc):
     """
     Continous Multi-Layer Perceptron Q-Value Network
     for determnistic policy training.
     """

     def __init__(
         self,
         env_spec):

         super(ContinuousMLPQ, self).__init__(env_spec)

         self.obs = mx.symbol.Variable("obs")
         self.act = mx.symbol.Variable("act")
         self.qval = define_qfunc(self.obs, self.act)
         self.yval = mx.symbol.Variable("yval")

     def get_output_symbol(self):

         return self.qval

     def get_loss_symbols(self):

         return {"qval": self.qval,
                 "yval": self.yval}

     def define_loss(self, loss_exp):

         self.loss = mx.symbol.MakeLoss(loss_exp, name="qfunc_loss")
         self.loss = mx.symbol.Group([self.loss, mx.symbol.BlockGrad(self.qval)])

     def define_exe(self, ctx, init, updater, input_shapes=None, args=None,
                     grad_req=None):

         # define an executor, initializer and updater for batch version loss
         self.exe = self.loss.simple_bind(ctx=ctx, **input_shapes)
         self.arg_arrays = self.exe.arg_arrays
         self.grad_arrays = self.exe.grad_arrays
         self.arg_dict = self.exe.arg_dict

         for name, arr in self.arg_dict.items():
             if name not in input_shapes:
                 init(name, arr)

         self.updater = updater

     def update_params(self, obs, act, yval):

         self.arg_dict["obs"][:] = obs
         self.arg_dict["act"][:] = act
         self.arg_dict["yval"][:] = yval

         self.exe.forward(is_train=True)
         self.exe.backward()

         for i, pair in enumerate(zip(self.arg_arrays, self.grad_arrays)):
             weight, grad = pair
             self.updater(i, grad, weight)

     def get_qvals(self, obs, act):

         self.exe.arg_dict["obs"][:] = obs
         self.exe.arg_dict["act"][:] = act
         self.exe.forward(is_train=False)

         return self.exe.outputs[1].asnumpy()
	from utils import define_qfunc
	import mxnet as mx


	class QFunc(object):
	"""
	Base class for Q-Value Function.
	"""

	def __init__(self, env_spec):

	self.env_spec = env_spec

	def get_qvals(self, obs, act):

	raise NotImplementedError


	class ContinuousMLPQ(QFunc):
	"""
	Continous Multi-Layer Perceptron Q-Value Network
	for determnistic policy training.
	"""

	def __init__(
	self,
	env_spec):

	super(ContinuousMLPQ, self).__init__(env_spec)

	self.obs = mx.symbol.Variable("obs")
	self.act = mx.symbol.Variable("act")
	self.qval = define_qfunc(self.obs, self.act)
	self.yval = mx.symbol.Variable("yval")

	def get_output_symbol(self):

	return self.qval

	def get_loss_symbols(self):

	return {"qval": self.qval,
	"yval": self.yval}

	def define_loss(self, loss_exp):

	self.loss = mx.symbol.MakeLoss(loss_exp, name="qfunc_loss")
	self.loss = mx.symbol.Group([self.loss, mx.symbol.BlockGrad(self.qval)])

	def define_exe(self, ctx, init, updater, input_shapes=None, args=None,
	grad_req=None):

	# define an executor, initializer and updater for batch version loss
	self.exe = self.loss.simple_bind(ctx=ctx, **input_shapes)
	self.arg_arrays = self.exe.arg_arrays
	self.grad_arrays = self.exe.grad_arrays
	self.arg_dict = self.exe.arg_dict

	for name, arr in self.arg_dict.items():
	if name not in input_shapes:
	init(name, arr)

	self.updater = updater

	def update_params(self, obs, act, yval):

	self.arg_dict["obs"][:] = obs
	self.arg_dict["act"][:] = act
	self.arg_dict["yval"][:] = yval

	self.exe.forward(is_train=True)
	self.exe.backward()

	for i, pair in enumerate(zip(self.arg_arrays, self.grad_arrays)):
	weight, grad = pair
	self.updater(i, grad, weight)

	def get_qvals(self, obs, act):

	self.exe.arg_dict["obs"][:] = obs
	self.exe.arg_dict["act"][:] = act
	self.exe.forward(is_train=False)

	return self.exe.outputs[1].asnumpy()