example/svrg_module/linear_regression/common.py - mxnet - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.


 import mxnet as mx
 import logging
 from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule


 def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size, update_freq, ctx, logger):
     """Builds the training iterator and the SVRGModule for a linear regression symbol

     The symbol is dot(data, fc_weight^T) broadcast-added to fc_bias, wrapped in
     LinearRegressionOutput against the "label" variable.

     Parameters
     ----------
     train_features:
         training inputs, passed to mx.io.NDArrayIter as the data
     train_labels:
         training targets, passed to mx.io.NDArrayIter as the label
     feature_dim: int
         second dimension of the (1, feature_dim) weight variable
     batch_size: int
         batch size of the iterator
     update_freq: int
         forwarded to SVRGModule as update_freq
     ctx:
         forwarded to SVRGModule as context
     logger:
         forwarded to SVRGModule as logger

     Returns
     ----------
     train_iter: mx.io.NDArrayIter
         shuffled iterator over the training data
     mod: SVRGModule
         module wrapping the linear regression symbol

     """
     print("Fitting linear regression with mxnet")
     train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size,
                                    shuffle=True, data_name='data', label_name='label')

     # symbolic placeholders matching the iterator's data/label names
     features = mx.sym.Variable("data")
     target = mx.sym.Variable("label")

     # prediction = data . fc_weight^T + fc_bias
     fc_weight = mx.sym.Variable("fc_weight", shape=(1, feature_dim))
     prediction = mx.sym.dot(features, fc_weight.transpose())
     fc_bias = mx.sym.Variable("fc_bias", shape=(1,), wd_mult=0.0, lr_mult=10.0)
     prediction = mx.sym.broadcast_plus(prediction, fc_bias)
     output = mx.sym.LinearRegressionOutput(data=prediction, label=target)

     mod = SVRGModule(symbol=output, context=ctx, data_names=['data'], label_names=['label'],
                      logger=logger, update_freq=update_freq)
     return train_iter, mod


 def create_metrics(metrics):
     """Creates an evaluation metric by delegating to mx.metric.create

     Parameters
     ----------
     metrics:
         metric specification, passed unchanged to mx.metric.create

     Returns
     ----------
     metric:
         whatever mx.metric.create returns for the given specification

     """
     return mx.metric.create(metrics)


 def create_logger():
     """Returns the 'sgd_svrg' logger, set to INFO and writing to 'experiments.log'

     logging.getLogger returns the same object for the same name, so the file handler
     is attached only when an equivalent one is not already present. Without this guard
     each call would add one more handler and every record would be written once per
     call made so far.

     Returns
     ----------
     logger: logging.Logger
         the 'sgd_svrg' logger with a single FileHandler for 'experiments.log'

     """
     import os  # local import: only needed to resolve the log path for the duplicate check

     logger = logging.getLogger('sgd_svrg')
     logger.setLevel(logging.INFO)

     # FileHandler keeps the absolute path in baseFilename, so compare against that
     log_path = os.path.abspath('experiments.log')
     already_attached = any(
         isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
         for handler in logger.handlers
     )
     if not already_attached:
         fh = logging.FileHandler('experiments.log')
         fh.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
         logger.addHandler(fh)
     return logger


 ################################################################################
 # Functions below are for benchmarking purposes, to calculate the expectation and variance of
 # gradients per epoch for each parameter. These calculations will be helpful when
 # benchmarking SVRG optimization with other optimization techniques, such as SGD.
 # Currently it only calculates the expectation, variance for single context but
 # can be extended to multi-context in later iterations.
 ################################################################################

 def accumulate_grad(grad_dict, mod):
     """Adds the module's current gradients to the per-parameter history in grad_dict

     Parameters
     ----------
     grad_dict: dict
         dictionary that maps parameter name to accumulated gradients, updated in place
     mod:
         object exposing _exec_group.param_names and _exec_group.grad_arrays

     """
     exec_group = mod._exec_group

     for position, param in enumerate(exec_group.param_names):
         # only entry 0 of each grad_arrays slot is read (presumably the first context;
         # the section header notes that only a single context is handled)
         current_grad = exec_group.grad_arrays[position][0]
         if param in grad_dict:
             # later batches are stacked onto the stored history along axis 0
             grad_dict[param] = mx.ndarray.concat(grad_dict[param], current_grad, dim=0)
         else:
             # first sighting: store a copy rather than the executor's own array
             grad_dict[param] = current_grad.copy()


 def calc_expectation(grad_dict, num_batches):
     """Calculates the expectation of the gradients per epoch for each parameter w.r.t number of batches

     For every key present when the function is called, a companion entry named
     key + "_expectation" is stored, holding sum(grad_dict[key], axis=0) / num_batches.

     Parameters
     ----------
     grad_dict: dict
         dictionary that maps parameter name to gradients in the mod executor group
     num_batches: int
         number of batches

     Returns
     ----------
     grad_dict: dict
         the same dictionary, with new keys mapping to gradients expectations

     """
     # Iterate over a snapshot of the keys: inserting the "_expectation" entries while
     # walking the live key view raises "RuntimeError: dictionary changed size during
     # iteration" on Python 3.
     for key in list(grad_dict):
         grad_dict[key + "_expectation"] = mx.ndarray.sum(grad_dict[key], axis=0) / num_batches

     return grad_dict


 def calc_variance(grad_dict, num_batches, param_names):
     """Calculates the variance of the gradients per epoch for each parameter w.r.t number of batches

     For each name in param_names, reads grad_dict[name] and grad_dict[name + "_expectation"]
     (the latter must already be present, otherwise a KeyError is raised) and stores
     sum(square(gradients - expectation), axis=0) / num_batches under name + "_variance".

     Parameters
     ----------
     grad_dict: dict
         dictionary that maps parameter name to gradients in the mod executor group
     num_batches: int
         number of batches
     param_names: list of str
         parameter names in the module

     Returns
     ----------
     grad_dict: dict
         the same dictionary, with new keys mapping to gradients variance

     """
     for name in param_names:
         diff_sqr = mx.ndarray.square(mx.nd.subtract(grad_dict[name],
                                                     grad_dict[name + "_expectation"]))
         grad_dict[name + "_variance"] = mx.ndarray.sum(diff_sqr, axis=0) / num_batches

     # the documented contract is to hand the dictionary back, mirroring calc_expectation
     return grad_dict
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.


	import mxnet as mx
	import logging
	from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule


	def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size, update_freq, ctx, logger):
	    """Builds the training iterator and the SVRGModule for a linear regression symbol

	    The symbol is dot(data, fc_weight^T) broadcast-added to fc_bias, wrapped in
	    LinearRegressionOutput against the "label" variable.

	    Parameters
	    ----------
	    train_features:
	        training inputs, passed to mx.io.NDArrayIter as the data
	    train_labels:
	        training targets, passed to mx.io.NDArrayIter as the label
	    feature_dim: int
	        second dimension of the (1, feature_dim) weight variable
	    batch_size: int
	        batch size of the iterator
	    update_freq: int
	        forwarded to SVRGModule as update_freq
	    ctx:
	        forwarded to SVRGModule as context
	    logger:
	        forwarded to SVRGModule as logger

	    Returns
	    ----------
	    train_iter: mx.io.NDArrayIter
	        shuffled iterator over the training data
	    mod: SVRGModule
	        module wrapping the linear regression symbol

	    """
	    print("Fitting linear regression with mxnet")
	    train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size, shuffle=True,
	                                   data_name='data', label_name='label')
	    data = mx.sym.Variable("data")
	    label = mx.sym.Variable("label")
	    # prediction = data . fc_weight^T + fc_bias
	    weight = mx.sym.Variable("fc_weight", shape=(1, feature_dim))
	    net = mx.sym.dot(data, weight.transpose())
	    bias = mx.sym.Variable("fc_bias", shape=(1,), wd_mult=0.0, lr_mult=10.0)
	    net = mx.sym.broadcast_plus(net, bias)
	    net = mx.sym.LinearRegressionOutput(data=net, label=label)
	    mod = SVRGModule(symbol=net, context=ctx, data_names=['data'], label_names=['label'], logger=logger,
	                     update_freq=update_freq)
	    return train_iter, mod


	def create_metrics(metrics):
	    """Creates an evaluation metric by delegating to mx.metric.create

	    Parameters
	    ----------
	    metrics:
	        metric specification, passed unchanged to mx.metric.create

	    Returns
	    ----------
	    metric:
	        whatever mx.metric.create returns for the given specification

	    """
	    metric = mx.metric.create(metrics)
	    return metric


	def create_logger():
	    """Returns the 'sgd_svrg' logger, set to INFO and writing to 'experiments.log'

	    logging.getLogger returns the same object for the same name, so the file handler
	    is attached only when an equivalent one is not already present. Without this guard
	    each call would add one more handler and every record would be written once per
	    call made so far.

	    Returns
	    ----------
	    logger: logging.Logger
	        the 'sgd_svrg' logger with a single FileHandler for 'experiments.log'

	    """
	    import os  # local import: only needed to resolve the log path for the duplicate check

	    logger = logging.getLogger('sgd_svrg')
	    logger.setLevel(logging.INFO)

	    # FileHandler keeps the absolute path in baseFilename, so compare against that
	    log_path = os.path.abspath('experiments.log')
	    already_attached = any(
	        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
	        for handler in logger.handlers
	    )
	    if not already_attached:
	        formatter = logging.Formatter('%(asctime)s - %(message)s')
	        fh = logging.FileHandler('experiments.log')
	        fh.setFormatter(formatter)
	        logger.addHandler(fh)
	    return logger


	################################################################################
	# Functions below are for benchmarking purposes, to calculate the expectation and variance of
	# gradients per epoch for each parameter. These calculations will be helpful when
	# benchmarking SVRG optimization with other optimization techniques, such as SGD.
	# Currently it only calculates the expectation, variance for single context but
	# can be extended to multi-context in later iterations.
	################################################################################

	def accumulate_grad(grad_dict, mod):
	    """Adds the module's current gradients to the per-parameter history in grad_dict

	    Parameters
	    ----------
	    grad_dict: dict
	        dictionary that maps parameter name to accumulated gradients, updated in place
	    mod:
	        object exposing _exec_group.param_names and _exec_group.grad_arrays

	    """
	    param_names = mod._exec_group.param_names

	    for index, name in enumerate(param_names):
	        if name not in grad_dict:
	            # first sighting: store a copy rather than the executor's own array
	            grad_dict[name] = mod._exec_group.grad_arrays[index][0].copy()
	        else:
	            # later batches are stacked onto the stored history along axis 0
	            grad_dict[name] = mx.ndarray.concat(grad_dict[name], mod._exec_group.grad_arrays[index][0], dim=0)


	def calc_expectation(grad_dict, num_batches):
	    """Calculates the expectation of the gradients per epoch for each parameter w.r.t number of batches

	    For every key present when the function is called, a companion entry named
	    key + "_expectation" is stored, holding sum(grad_dict[key], axis=0) / num_batches.

	    Parameters
	    ----------
	    grad_dict: dict
	        dictionary that maps parameter name to gradients in the mod executor group
	    num_batches: int
	        number of batches

	    Returns
	    ----------
	    grad_dict: dict
	        the same dictionary, with new keys mapping to gradients expectations

	    """
	    # Iterate over a snapshot of the keys: inserting the "_expectation" entries while
	    # walking the live key view raises "RuntimeError: dictionary changed size during
	    # iteration" on Python 3.
	    for key in list(grad_dict):
	        grad_dict[key + "_expectation"] = mx.ndarray.sum(grad_dict[key], axis=0) / num_batches

	    return grad_dict


	def calc_variance(grad_dict, num_batches, param_names):
	    """Calculates the variance of the gradients per epoch for each parameter w.r.t number of batches

	    For each name in param_names, reads grad_dict[name] and grad_dict[name + "_expectation"]
	    (the latter must already be present, otherwise a KeyError is raised) and stores
	    sum(square(gradients - expectation), axis=0) / num_batches under name + "_variance".

	    Parameters
	    ----------
	    grad_dict: dict
	        dictionary that maps parameter name to gradients in the mod executor group
	    num_batches: int
	        number of batches
	    param_names: list of str
	        parameter names in the module

	    Returns
	    ----------
	    grad_dict: dict
	        the same dictionary, with new keys mapping to gradients variance

	    """
	    for name in param_names:
	        diff_sqr = mx.ndarray.square(mx.nd.subtract(grad_dict[name],
	                                                    grad_dict[name + "_expectation"]))
	        grad_dict[name + "_variance"] = mx.ndarray.sum(diff_sqr, axis=0) / num_batches

	    # the documented contract is to hand the dictionary back, mirroring calc_expectation
	    return grad_dict