| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| ''' |
| Created on Jun 15, 2017 |
| |
| @author: shujon |
| ''' |
| |
| from __future__ import print_function |
| import logging |
| from datetime import datetime |
| import os |
| import argparse |
| import errno |
| import mxnet as mx |
| import numpy as np |
| import cv2 |
| from scipy.io import savemat |
| |
| ###################################################################### |
| #An adversarial variational autoencoder implementation in mxnet |
| # following the implementation at https://github.com/JeremyCCHsu/tf-vaegan |
| # of paper `Larsen, Anders Boesen Lindbo, et al. "Autoencoding beyond pixels using a |
| # learned similarity metric." arXiv preprint arXiv:1512.09300 (2015).` |
| ###################################################################### |
| |
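# Example usage (the script file name below is assumed; adjust the paths to your data):
#   python vaegan_mxnet.py --train --training_data_path datasets/caltech101/data/images32x32
#   python vaegan_mxnet.py --test --testing_data_path datasets/caltech101/test_data --save_embedding
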
| @mx.init.register |
| class MyConstant(mx.init.Initializer): |
    '''Constant initializer, registered with MXNet but not currently used in this example
    '''
| def __init__(self, value): |
| super(MyConstant, self).__init__(value=value) |
| self.value = value |
| |
| def _init_weight(self, _, arr): |
| arr[:] = mx.nd.array(self.value) |
| |
| def encoder(nef, z_dim, batch_size, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): |
    '''The encoder is a CNN which takes a 32x32 image as input and generates
    a z_dim-dimensional shape embedding, drawn as a sample from a normal
    distribution with the predicted mean and variance
    '''
| BatchNorm = mx.sym.BatchNorm |
| |
| data = mx.sym.Variable('data') |
| |
| e1 = mx.sym.Convolution(data, name='enc1', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef, no_bias=no_bias) |
| ebn1 = BatchNorm(e1, name='encbn1', fix_gamma=fix_gamma, eps=eps) |
| eact1 = mx.sym.LeakyReLU(ebn1, name='encact1', act_type='leaky', slope=0.2) |
| |
| e2 = mx.sym.Convolution(eact1, name='enc2', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*2, no_bias=no_bias) |
| ebn2 = BatchNorm(e2, name='encbn2', fix_gamma=fix_gamma, eps=eps) |
| eact2 = mx.sym.LeakyReLU(ebn2, name='encact2', act_type='leaky', slope=0.2) |
| |
| e3 = mx.sym.Convolution(eact2, name='enc3', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*4, no_bias=no_bias) |
| ebn3 = BatchNorm(e3, name='encbn3', fix_gamma=fix_gamma, eps=eps) |
| eact3 = mx.sym.LeakyReLU(ebn3, name='encact3', act_type='leaky', slope=0.2) |
| |
| e4 = mx.sym.Convolution(eact3, name='enc4', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*8, no_bias=no_bias) |
| ebn4 = BatchNorm(e4, name='encbn4', fix_gamma=fix_gamma, eps=eps) |
| eact4 = mx.sym.LeakyReLU(ebn4, name='encact4', act_type='leaky', slope=0.2) |
| |
| eact4 = mx.sym.Flatten(eact4) |
| |
| z_mu = mx.sym.FullyConnected(eact4, num_hidden=z_dim, name="enc_mu") |
| z_lv = mx.sym.FullyConnected(eact4, num_hidden=z_dim, name="enc_lv") |
| |
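    # reparameterization trick: z = mu + sigma * epsilon with sigma = exp(0.5 * log_var)
    # and epsilon ~ N(0, 1), so sampling stays differentiable w.r.t. mu and log_var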
    z = z_mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5 * z_lv),
                                       mx.symbol.random_normal(loc=0, scale=1, shape=(batch_size, z_dim)))
| |
| return z_mu, z_lv, z |
| |
| def generator(ngf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12, z_dim=100, activation='sigmoid'): |
    '''The generator is a CNN which takes a z_dim-dimensional embedding as input
    and reconstructs the input image given to the encoder
    '''
| BatchNorm = mx.sym.BatchNorm |
| rand = mx.sym.Variable('rand') |
| |
| rand = mx.sym.Reshape(rand, shape=(-1, z_dim, 1, 1)) |
| |
| g1 = mx.sym.Deconvolution(rand, name='gen1', kernel=(5,5), stride=(2,2),target_shape=(2,2), num_filter=ngf*8, no_bias=no_bias) |
| gbn1 = BatchNorm(g1, name='genbn1', fix_gamma=fix_gamma, eps=eps) |
| gact1 = mx.sym.Activation(gbn1, name="genact1", act_type="relu") |
| |
| g2 = mx.sym.Deconvolution(gact1, name='gen2', kernel=(5,5), stride=(2,2),target_shape=(4,4), num_filter=ngf*4, no_bias=no_bias) |
| gbn2 = BatchNorm(g2, name='genbn2', fix_gamma=fix_gamma, eps=eps) |
| gact2 = mx.sym.Activation(gbn2, name='genact2', act_type='relu') |
| |
| g3 = mx.sym.Deconvolution(gact2, name='gen3', kernel=(5,5), stride=(2,2), target_shape=(8,8), num_filter=ngf*2, no_bias=no_bias) |
| gbn3 = BatchNorm(g3, name='genbn3', fix_gamma=fix_gamma, eps=eps) |
| gact3 = mx.sym.Activation(gbn3, name='genact3', act_type='relu') |
| |
| g4 = mx.sym.Deconvolution(gact3, name='gen4', kernel=(5,5), stride=(2,2), target_shape=(16,16), num_filter=ngf, no_bias=no_bias) |
| gbn4 = BatchNorm(g4, name='genbn4', fix_gamma=fix_gamma, eps=eps) |
| gact4 = mx.sym.Activation(gbn4, name='genact4', act_type='relu') |
| |
| g5 = mx.sym.Deconvolution(gact4, name='gen5', kernel=(5,5), stride=(2,2), target_shape=(32,32), num_filter=nc, no_bias=no_bias) |
| gout = mx.sym.Activation(g5, name='genact5', act_type=activation) |
| |
| return gout |
| |
| def discriminator1(ndf, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): |
    '''First part of the discriminator which takes a 32x32 image as input
    and outputs a convolutional feature map; the feature map is needed to
    calculate the layer loss'''
| BatchNorm = mx.sym.BatchNorm |
| |
| data = mx.sym.Variable('data') |
| |
| d1 = mx.sym.Convolution(data, name='d1', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf, no_bias=no_bias) |
| dact1 = mx.sym.LeakyReLU(d1, name='dact1', act_type='leaky', slope=0.2) |
| |
| d2 = mx.sym.Convolution(dact1, name='d2', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*2, no_bias=no_bias) |
| dbn2 = BatchNorm(d2, name='dbn2', fix_gamma=fix_gamma, eps=eps) |
| dact2 = mx.sym.LeakyReLU(dbn2, name='dact2', act_type='leaky', slope=0.2) |
| |
| d3 = mx.sym.Convolution(dact2, name='d3', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*4, no_bias=no_bias) |
| dbn3 = BatchNorm(d3, name='dbn3', fix_gamma=fix_gamma, eps=eps) |
| dact3 = mx.sym.LeakyReLU(dbn3, name='dact3', act_type='leaky', slope=0.2) |
| |
| return dact3 |
| |
| def discriminator2(ndf, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): |
    '''Second part of the discriminator which takes the (ndf*4)x4x4 feature map
    produced by discriminator1 (256x4x4 with the default ndf=64) as input and
    generates the loss based on whether the input image was real or fake'''
| |
| BatchNorm = mx.sym.BatchNorm |
| |
| data = mx.sym.Variable('data') |
| |
| label = mx.sym.Variable('label') |
| |
| d4 = mx.sym.Convolution(data, name='d4', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*8, no_bias=no_bias) |
| dbn4 = BatchNorm(d4, name='dbn4', fix_gamma=fix_gamma, eps=eps) |
| dact4 = mx.sym.LeakyReLU(dbn4, name='dact4', act_type='leaky', slope=0.2) |
| |
| h = mx.sym.Flatten(dact4) |
| |
| d5 = mx.sym.FullyConnected(h, num_hidden=1, name="d5") |
| |
| dloss = mx.sym.LogisticRegressionOutput(data=d5, label=label, name='dloss') |
| |
| return dloss |
| |
| def GaussianLogDensity(x, mu, log_var, name='GaussianLogDensity', EPSILON = 1e-6): |
    '''Gaussian log-density, used to calculate the layer-wise loss
    '''
    c = mx.sym.ones_like(log_var) * 2.0 * np.pi  # 2*pi, for the log(2*pi*var) normalization term
    c = mx.symbol.log(c)
    var = mx.sym.exp(log_var)
    x_mu2 = mx.symbol.square(x - mu)
    x_mu2_over_var = mx.symbol.broadcast_div(x_mu2, var + EPSILON)
    log_prob = -0.5 * (c + log_var + x_mu2_over_var)
    log_prob = mx.symbol.sum(log_prob, axis=1, name=name)
| return log_prob |
| |
| def DiscriminatorLayerLoss(): |
| '''Calculate the discriminator layer loss |
| ''' |
| |
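    # 'data' carries the discriminator features of the decoded image and 'label'
    # the features of the corresponding real image; the loss is the negative
    # Gaussian log-density of the real features under a unit-variance Gaussian
    # centered at the decoded features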
| data = mx.sym.Variable('data') |
| |
| label = mx.sym.Variable('label') |
| |
| data = mx.sym.Flatten(data) |
| label = mx.sym.Flatten(label) |
| |
| label = mx.sym.BlockGrad(label) |
| |
| zeros = mx.sym.zeros_like(data) |
| |
| output = -GaussianLogDensity(label, data, zeros) |
| |
| dloss = mx.symbol.MakeLoss(mx.symbol.mean(output),name='lloss') |
| |
| return dloss |
| |
| def KLDivergenceLoss(): |
    '''KL divergence between the predicted distribution N(mu1, exp(lv1))
    and the standard normal prior N(0, I)
    '''
| |
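    # 'data' stacks the predicted mean and log-variance along axis 0; the loss is
    # computed with the general two-Gaussian KL formula, with the second Gaussian
    # fixed to N(0, I)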
| data = mx.sym.Variable('data') |
| mu1, lv1 = mx.sym.split(data, num_outputs=2, axis=0) |
| mu2 = mx.sym.zeros_like(mu1) |
| lv2 = mx.sym.zeros_like(lv1) |
| |
| v1 = mx.sym.exp(lv1) |
| v2 = mx.sym.exp(lv2) |
| mu_diff_sq = mx.sym.square(mu1 - mu2) |
| dimwise_kld = .5 * ( |
| (lv2 - lv1) + mx.symbol.broadcast_div(v1, v2) + mx.symbol.broadcast_div(mu_diff_sq, v2) - 1.) |
| KL = mx.symbol.sum(dimwise_kld, axis=1) |
| |
| KLloss = mx.symbol.MakeLoss(mx.symbol.mean(KL),name='KLloss') |
| return KLloss |
| |
| def get_data(path, activation): |
| '''Get the dataset |
| ''' |
| data = [] |
| image_names = [] |
    for filename in os.listdir(path):
        img = cv2.imread(os.path.join(path, filename), cv2.IMREAD_GRAYSCALE)
        if img is not None:
            image_names.append(filename)
            data.append(img)
| |
| data = np.asarray(data) |
| |
| if activation == 'sigmoid': |
| data = data.astype(np.float32)/(255.0) |
| elif activation == 'tanh': |
| data = data.astype(np.float32)/(255.0/2) - 1.0 |
| |
| data = data.reshape((data.shape[0], 1, data.shape[1], data.shape[2])) |
| |
    np.random.seed(1234)
    p = np.random.permutation(data.shape[0])
    X = data[p]
    # keep the file names aligned with the shuffled images
    image_names = [image_names[i] for i in p]

    return X, image_names
| |
| class RandIter(mx.io.DataIter): |
| '''Create a random iterator for generator |
| ''' |
| def __init__(self, batch_size, ndim): |
| self.batch_size = batch_size |
| self.ndim = ndim |
| self.provide_data = [('rand', (batch_size, ndim, 1, 1))] |
| self.provide_label = [] |
| |
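    # the iterator never exhausts; every call to getdata() returns a fresh
    # batch of standard normal noise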
| def iter_next(self): |
| return True |
| |
| def getdata(self): |
| return [mx.random.normal(0, 1.0, shape=(self.batch_size, self.ndim, 1, 1))] |
| |
| def fill_buf(buf, i, img, shape): |
| '''fill the ith grid of the buffer matrix with the values from the img |
| buf : buffer matrix |
| i : serial of the image in the 2D grid |
| img : image data |
| shape : ( height width depth ) of image''' |
| |
| # grid height is a multiple of individual image height |
    m = buf.shape[0] // shape[0]

    sx = (i % m) * shape[1]
    sy = (i // m) * shape[0]
| buf[sy:sy+shape[0], sx:sx+shape[1], :] = img |
| |
| def visual(title, X, activation): |
| '''create a grid of images and save it as a final image |
| title : grid image name |
| X : array of images |
| ''' |
| assert len(X.shape) == 4 |
| |
| X = X.transpose((0, 2, 3, 1)) |
| if activation == 'sigmoid': |
| X = np.clip((X)*(255.0), 0, 255).astype(np.uint8) |
| elif activation == 'tanh': |
| X = np.clip((X+1.0)*(255.0/2.0), 0, 255).astype(np.uint8) |
| n = np.ceil(np.sqrt(X.shape[0])) |
| buff = np.zeros((int(n*X.shape[1]), int(n*X.shape[2]), int(X.shape[3])), dtype=np.uint8) |
| for i, img in enumerate(X): |
| fill_buf(buff, i, img, X.shape[1:3]) |
| cv2.imwrite('%s.jpg' % (title), buff) |
| |
def train(dataset, nef, ndf, ngf, nc, batch_size, Z, lr, beta1, epsilon, ctx, check_point, g_dl_weight, output_path, checkpoint_path, data_path, activation, num_epoch, save_after_every, visualize_after_every, show_after_every):
| '''adversarial training of the VAE |
| ''' |
| |
| #encoder |
| z_mu, z_lv, z = encoder(nef, Z, batch_size) |
| symE = mx.sym.Group([z_mu, z_lv, z]) |
| |
| #generator |
    symG = generator(ngf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12, z_dim=Z, activation=activation)
| |
| #discriminator |
| h = discriminator1(ndf) |
| dloss = discriminator2(ndf) |
| symD1 = h |
| symD2 = dloss |
| |
| |
| # ==============data============== |
| X_train, _ = get_data(data_path, activation) |
| train_iter = mx.io.NDArrayIter(X_train, batch_size=batch_size, shuffle=True) |
| rand_iter = RandIter(batch_size, Z) |
| label = mx.nd.zeros((batch_size,), ctx=ctx) |
| |
| # =============module E============= |
| modE = mx.mod.Module(symbol=symE, data_names=('data',), label_names=None, context=ctx) |
| modE.bind(data_shapes=train_iter.provide_data) |
| modE.init_params(initializer=mx.init.Normal(0.02)) |
| modE.init_optimizer( |
| optimizer='adam', |
| optimizer_params={ |
| 'learning_rate': lr, |
| 'wd': 1e-6, |
| 'beta1': beta1, |
| 'epsilon': epsilon, |
| 'rescale_grad': (1.0/batch_size) |
| }) |
| mods = [modE] |
| |
| # =============module G============= |
| modG = mx.mod.Module(symbol=symG, data_names=('rand',), label_names=None, context=ctx) |
| modG.bind(data_shapes=rand_iter.provide_data, inputs_need_grad=True) |
| modG.init_params(initializer=mx.init.Normal(0.02)) |
| modG.init_optimizer( |
| optimizer='adam', |
| optimizer_params={ |
| 'learning_rate': lr, |
| 'wd': 1e-6, |
| 'beta1': beta1, |
| 'epsilon': epsilon, |
| }) |
| mods.append(modG) |
| |
| # =============module D============= |
| modD1 = mx.mod.Module(symD1, label_names=[], context=ctx) |
| modD2 = mx.mod.Module(symD2, label_names=('label',), context=ctx) |
| modD = mx.mod.SequentialModule() |
| modD.add(modD1).add(modD2, take_labels=True, auto_wiring=True) |
| modD.bind(data_shapes=train_iter.provide_data, |
| label_shapes=[('label', (batch_size,))], |
| inputs_need_grad=True) |
| modD.init_params(initializer=mx.init.Normal(0.02)) |
| modD.init_optimizer( |
| optimizer='adam', |
| optimizer_params={ |
| 'learning_rate': lr, |
| 'wd': 1e-3, |
| 'beta1': beta1, |
| 'epsilon': epsilon, |
| 'rescale_grad': (1.0/batch_size) |
| }) |
| mods.append(modD) |
| |
| |
| # =============module DL============= |
| symDL = DiscriminatorLayerLoss() |
| modDL = mx.mod.Module(symbol=symDL, data_names=('data',), label_names=('label',), context=ctx) |
    modDL.bind(data_shapes=[('data', (batch_size, nef * 4, 4, 4))],
               label_shapes=[('label', (batch_size, nef * 4, 4, 4))],
               inputs_need_grad=True)
| modDL.init_params(initializer=mx.init.Normal(0.02)) |
| modDL.init_optimizer( |
| optimizer='adam', |
| optimizer_params={ |
| 'learning_rate': lr, |
| 'wd': 0., |
| 'beta1': beta1, |
| 'epsilon': epsilon, |
| 'rescale_grad': (1.0/batch_size) |
| }) |
| |
| # =============module KL============= |
| symKL = KLDivergenceLoss() |
| modKL = mx.mod.Module(symbol=symKL, data_names=('data',), label_names=None, context=ctx) |
| modKL.bind(data_shapes=[('data', (batch_size*2,Z))], |
| inputs_need_grad=True) |
| modKL.init_params(initializer=mx.init.Normal(0.02)) |
| modKL.init_optimizer( |
| optimizer='adam', |
| optimizer_params={ |
| 'learning_rate': lr, |
| 'wd': 0., |
| 'beta1': beta1, |
| 'epsilon': epsilon, |
| 'rescale_grad': (1.0/batch_size) |
| }) |
| mods.append(modKL) |
| |
    def norm_stat(d):
        return mx.nd.norm(d)/np.sqrt(d.size)
    # set mon to a Monitor instance to inspect intermediate outputs and gradients, e.g.
    # mon = mx.mon.Monitor(10, norm_stat, pattern=".*output|d1_backward_data", sort=True)
    mon = None
    if mon is not None:
        for mod in mods:
            mod.install_monitor(mon)
| |
| def facc(label, pred): |
| '''calculating prediction accuracy |
| ''' |
| pred = pred.ravel() |
| label = label.ravel() |
| return ((pred > 0.5) == label).mean() |
| |
| def fentropy(label, pred): |
| '''calculating binary cross-entropy loss |
| ''' |
| pred = pred.ravel() |
| label = label.ravel() |
| return -(label*np.log(pred+1e-12) + (1.-label)*np.log(1.-pred+1e-12)).mean() |
| |
| def kldivergence(label, pred): |
| '''calculating KL divergence loss |
| ''' |
| mean, log_var = np.split(pred, 2, axis=0) |
| var = np.exp(log_var) |
| KLLoss = -0.5 * np.sum(1 + log_var - np.power(mean, 2) - var) |
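        # nElements is assigned inside the training loop below (set to batch_size)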
| KLLoss = KLLoss / nElements |
| return KLLoss |
| |
| mG = mx.metric.CustomMetric(fentropy) |
| mD = mx.metric.CustomMetric(fentropy) |
| mE = mx.metric.CustomMetric(kldivergence) |
| mACC = mx.metric.CustomMetric(facc) |
| |
| print('Training...') |
| stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') |
| |
| # =============train=============== |
| for epoch in range(num_epoch): |
| train_iter.reset() |
| for t, batch in enumerate(train_iter): |
| |
| rbatch = rand_iter.next() |
| |
| if mon is not None: |
| mon.tic() |
| |
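            # generate fake images from random noise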
| modG.forward(rbatch, is_train=True) |
| outG = modG.get_outputs() |
| |
| # update discriminator on fake |
| label[:] = 0 |
| modD.forward(mx.io.DataBatch(outG, [label]), is_train=True) |
| modD.backward() |
| gradD11 = [[grad.copyto(grad.context) for grad in grads] for grads in modD1._exec_group.grad_arrays] |
| gradD12 = [[grad.copyto(grad.context) for grad in grads] for grads in modD2._exec_group.grad_arrays] |
| |
| modD.update_metric(mD, [label]) |
| modD.update_metric(mACC, [label]) |
| |
| |
| #update discriminator on decoded |
| modE.forward(batch, is_train=True) |
| mu, lv, z = modE.get_outputs() |
| z = z.reshape((batch_size, Z, 1, 1)) |
| sample = mx.io.DataBatch([z], label=None, provide_data = [('rand', (batch_size, Z, 1, 1))]) |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| label[:] = 0 |
| modD.forward(mx.io.DataBatch(xz, [label]), is_train=True) |
| modD.backward() |
| |
            # the discriminator update is deferred until the gradients from all three passes are combined below
| gradD21 = [[grad.copyto(grad.context) for grad in grads] for grads in modD1._exec_group.grad_arrays] |
| gradD22 = [[grad.copyto(grad.context) for grad in grads] for grads in modD2._exec_group.grad_arrays] |
| modD.update_metric(mD, [label]) |
| modD.update_metric(mACC, [label]) |
| |
| # update discriminator on real |
| label[:] = 1 |
| batch.label = [label] |
| modD.forward(batch, is_train=True) |
| lx = [out.copyto(out.context) for out in modD1.get_outputs()] |
| modD.backward() |
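            # combine the gradients of the three discriminator passes (real, fake
            # and decoded) before performing a single update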
| for gradsr, gradsf, gradsd in zip(modD1._exec_group.grad_arrays, gradD11, gradD21): |
| for gradr, gradf, gradd in zip(gradsr, gradsf, gradsd): |
| gradr += 0.5 * (gradf + gradd) |
| for gradsr, gradsf, gradsd in zip(modD2._exec_group.grad_arrays, gradD12, gradD22): |
| for gradr, gradf, gradd in zip(gradsr, gradsf, gradsd): |
| gradr += 0.5 * (gradf + gradd) |
| |
| modD.update() |
| modD.update_metric(mD, [label]) |
| modD.update_metric(mACC, [label]) |
| |
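            # update the generator so that the discriminator classifies its
            # outputs, from both random noise and encoded samples, as real (label = 1)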
| modG.forward(rbatch, is_train=True) |
| outG = modG.get_outputs() |
| label[:] = 1 |
| modD.forward(mx.io.DataBatch(outG, [label]), is_train=True) |
| modD.backward() |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| gradG1 = [[grad.copyto(grad.context) for grad in grads] for grads in modG._exec_group.grad_arrays] |
| mG.update([label], modD.get_outputs()) |
| |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| label[:] = 1 |
| modD.forward(mx.io.DataBatch(xz, [label]), is_train=True) |
| modD.backward() |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| gradG2 = [[grad.copyto(grad.context) for grad in grads] for grads in modG._exec_group.grad_arrays] |
| mG.update([label], modD.get_outputs()) |
| |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| modD1.forward(mx.io.DataBatch(xz, []), is_train=True) |
| outD1 = modD1.get_outputs() |
| modDL.forward(mx.io.DataBatch(outD1, lx), is_train=True) |
| modDL.backward() |
| dlGrad = modDL.get_input_grads() |
| modD1.backward(dlGrad) |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| |
            for grads, gradsG1, gradsG2 in zip(modG._exec_group.grad_arrays, gradG1, gradG2):
                for grad, gradg1, gradg2 in zip(grads, gradsG1, gradsG2):
                    # in-place update: plain assignment would only rebind the name and leave the gradient unchanged
                    grad[:] = g_dl_weight * grad + 0.5 * (gradg1 + gradg2)
| |
| modG.update() |
| mG.update([label], modD.get_outputs()) |
| |
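            # the generator update block above is repeated a second time per batch,
            # mirroring the reference implementation this example follows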
| modG.forward(rbatch, is_train=True) |
| outG = modG.get_outputs() |
| label[:] = 1 |
| modD.forward(mx.io.DataBatch(outG, [label]), is_train=True) |
| modD.backward() |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| gradG1 = [[grad.copyto(grad.context) for grad in grads] for grads in modG._exec_group.grad_arrays] |
| mG.update([label], modD.get_outputs()) |
| |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| label[:] = 1 |
| modD.forward(mx.io.DataBatch(xz, [label]), is_train=True) |
| modD.backward() |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| gradG2 = [[grad.copyto(grad.context) for grad in grads] for grads in modG._exec_group.grad_arrays] |
| mG.update([label], modD.get_outputs()) |
| |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| modD1.forward(mx.io.DataBatch(xz, []), is_train=True) |
| outD1 = modD1.get_outputs() |
| modDL.forward(mx.io.DataBatch(outD1, lx), is_train=True) |
| modDL.backward() |
| dlGrad = modDL.get_input_grads() |
| modD1.backward(dlGrad) |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| |
            for grads, gradsG1, gradsG2 in zip(modG._exec_group.grad_arrays, gradG1, gradG2):
                for grad, gradg1, gradg2 in zip(grads, gradsG1, gradsG2):
                    # in-place update, as above
                    grad[:] = g_dl_weight * grad + 0.5 * (gradg1 + gradg2)
| |
| modG.update() |
| mG.update([label], modD.get_outputs()) |
| |
| modG.forward(sample, is_train=True) |
| xz = modG.get_outputs() |
| |
            # layer-loss pass: propagate the learned-similarity gradient through
            # the generator so it can reach the encoder below
| modD1.forward(mx.io.DataBatch(xz, []), is_train=True) |
| outD1 = modD1.get_outputs() |
| modDL.forward(mx.io.DataBatch(outD1, lx), is_train=True) |
| DLloss = modDL.get_outputs() |
| modDL.backward() |
| dlGrad = modDL.get_input_grads() |
| modD1.backward(dlGrad) |
| diffD = modD1.get_input_grads() |
| modG.backward(diffD) |
| #update encoder |
| nElements = batch_size |
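            # the KL loss consumes the predicted mean and log-variance concatenated along the batch axis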
| modKL.forward(mx.io.DataBatch([mx.ndarray.concat(mu,lv, dim=0)]), is_train=True) |
| KLloss = modKL.get_outputs() |
| modKL.backward() |
| gradKLLoss = modKL.get_input_grads() |
| diffG = modG.get_input_grads() |
| diffG = diffG[0].reshape((batch_size, Z)) |
| modE.backward(mx.ndarray.split(gradKLLoss[0], num_outputs=2, axis=0) + [diffG]) |
| modE.update() |
| pred = mx.ndarray.concat(mu,lv, dim=0) |
| mE.update([pred], [pred]) |
| if mon is not None: |
| mon.toc_print() |
| |
| t += 1 |
| if t % show_after_every == 0: |
| print('epoch:', epoch, 'iter:', t, 'metric:', mACC.get(), mG.get(), mD.get(), mE.get(), KLloss[0].asnumpy(), DLloss[0].asnumpy()) |
| mACC.reset() |
| mG.reset() |
| mD.reset() |
| mE.reset() |
| |
| if epoch % visualize_after_every == 0: |
            visual(output_path + '/' + 'gout' + str(epoch), outG[0].asnumpy(), activation)
            visual(output_path + '/' + 'data' + str(epoch), batch.data[0].asnumpy(), activation)
| |
| if check_point and epoch % save_after_every == 0: |
| print('Saving...') |
| modG.save_params(checkpoint_path + '/%s_G-%04d.params'%(dataset, epoch)) |
| modD.save_params(checkpoint_path + '/%s_D-%04d.params'%(dataset, epoch)) |
| modE.save_params(checkpoint_path + '/%s_E-%04d.params'%(dataset, epoch)) |
| |
| def test(nef, ngf, nc, batch_size, Z, ctx, pretrained_encoder_path, pretrained_generator_path, output_path, data_path, activation, save_embedding, embedding_path = ''): |
| '''Test the VAE with a pretrained encoder and generator. |
| Keep the batch size 1''' |
| #encoder |
| z_mu, z_lv, z = encoder(nef, Z, batch_size) |
| symE = mx.sym.Group([z_mu, z_lv, z]) |
| |
| #generator |
    symG = generator(ngf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12, z_dim=Z, activation=activation)
| |
| # ==============data============== |
| X_test, image_names = get_data(data_path, activation) |
| test_iter = mx.io.NDArrayIter(X_test, batch_size=batch_size, shuffle=False) |
| |
| # =============module E============= |
| modE = mx.mod.Module(symbol=symE, data_names=('data',), label_names=None, context=ctx) |
| modE.bind(data_shapes=test_iter.provide_data) |
| modE.load_params(pretrained_encoder_path) |
| |
| # =============module G============= |
| modG = mx.mod.Module(symbol=symG, data_names=('rand',), label_names=None, context=ctx) |
| modG.bind(data_shapes=[('rand', (1, Z, 1, 1))]) |
| modG.load_params(pretrained_generator_path) |
| |
| print('Testing...') |
| |
| # =============test=============== |
| test_iter.reset() |
| for t, batch in enumerate(test_iter): |
| |
        # encode the image and reconstruct it from the posterior mean
| modE.forward(batch, is_train=False) |
| mu, lv, z = modE.get_outputs() |
| mu = mu.reshape((batch_size, Z, 1, 1)) |
| sample = mx.io.DataBatch([mu], label=None, provide_data = [('rand', (batch_size, Z, 1, 1))]) |
| modG.forward(sample, is_train=False) |
| outG = modG.get_outputs() |
| |
| visual(output_path + '/' + 'gout'+str(t), outG[0].asnumpy(), activation) |
| visual(output_path + '/' + 'data'+str(t), batch.data[0].asnumpy(), activation) |
| image_name = image_names[t].split('.')[0] |
| |
| if save_embedding: |
| savemat(embedding_path+'/'+image_name+'.mat', {'embedding':mu.asnumpy()}) |
| |
| def create_and_validate_dir(data_dir): |
| '''Creates/Validates dir |
| ''' |
| if data_dir != "": |
| if not os.path.exists(data_dir): |
| try: |
| logging.info('create directory %s', data_dir) |
| os.makedirs(data_dir) |
| except OSError as exc: |
| if exc.errno != errno.EEXIST: |
| raise OSError('failed to create ' + data_dir) |
| |
| |
| def parse_args(): |
| '''Parse args |
| ''' |
    parser = argparse.ArgumentParser(description='Train and Test an Adversarial Variational Autoencoder')
| |
| parser.add_argument('--train', help='train the network', action='store_true') |
| parser.add_argument('--test', help='test the network', action='store_true') |
| parser.add_argument('--save_embedding', help='saves the shape embedding of each input image', action='store_true') |
| parser.add_argument('--dataset', help='dataset name', default='caltech', type=str) |
| parser.add_argument('--activation', help='activation i.e. sigmoid or tanh', default='sigmoid', type=str) |
| parser.add_argument('--training_data_path', help='training data path', default='datasets/caltech101/data/images32x32', type=str) |
| parser.add_argument('--testing_data_path', help='testing data path', default='datasets/caltech101/test_data', type=str) |
| parser.add_argument('--pretrained_encoder_path', help='pretrained encoder model path', default='checkpoints32x32_sigmoid/caltech_E-0045.params', type=str) |
| parser.add_argument('--pretrained_generator_path', help='pretrained generator model path', default='checkpoints32x32_sigmoid/caltech_G-0045.params', type=str) |
| parser.add_argument('--output_path', help='output path for the generated images', default='outputs32x32_sigmoid', type=str) |
| parser.add_argument('--embedding_path', help='output path for the generated embeddings', default='outputs32x32_sigmoid', type=str) |
    parser.add_argument('--checkpoint_path', help='checkpoint saving path', default='checkpoints32x32_sigmoid', type=str)
| parser.add_argument('--nef', help='encoder filter count in the first layer', default=64, type=int) |
| parser.add_argument('--ndf', help='discriminator filter count in the first layer', default=64, type=int) |
| parser.add_argument('--ngf', help='generator filter count in the second last layer', default=64, type=int) |
| parser.add_argument('--nc', help='generator filter count in the last layer i.e. 1 for grayscale image, 3 for RGB image', default=1, type=int) |
| parser.add_argument('--batch_size', help='batch size, keep it 1 during testing', default=64, type=int) |
| parser.add_argument('--Z', help='embedding size', default=100, type=int) |
| parser.add_argument('--lr', help='learning rate', default=0.0002, type=float) |
| parser.add_argument('--beta1', help='beta1 for adam optimizer', default=0.5, type=float) |
| parser.add_argument('--epsilon', help='epsilon for adam optimizer', default=1e-5, type=float) |
| parser.add_argument('--g_dl_weight', help='discriminator layer loss weight', default=1e-1, type=float) |
| parser.add_argument('--gpu', help='gpu index', default=0, type=int) |
| parser.add_argument('--use_cpu', help='use cpu', action='store_true') |
    parser.add_argument('--num_epoch', help='maximum number of epochs', default=45, type=int)
| parser.add_argument('--save_after_every', help='save checkpoint after every this number of epochs ', default=5, type=int) |
| parser.add_argument('--visualize_after_every', help='save output images after every this number of epochs', default=5, type=int) |
| parser.add_argument('--show_after_every', help='show metrics after this number of iterations', default=10, type=int) |
| |
| args = parser.parse_args() |
| return args |
| |
| def main(): |
| args = parse_args() |
| |
    if args.test:
        if not os.path.exists(args.testing_data_path):
            raise OSError("Provided Testing Path: {} does not exist".format(args.testing_data_path))
        if not os.path.exists(args.checkpoint_path):
            raise OSError("Provided Checkpoint Path: {} does not exist".format(args.checkpoint_path))
| |
| create_and_validate_dir(args.checkpoint_path) |
| create_and_validate_dir(args.output_path) |
| |
| # gpu context |
| if args.use_cpu: |
| ctx = mx.cpu() |
| else: |
| ctx = mx.gpu(args.gpu) |
| |
| # checkpoint saving flags |
| check_point = True |
| |
| if args.train: |
| train(args.dataset, args.nef, args.ndf, args.ngf, args.nc, args.batch_size, args.Z, args.lr, args.beta1, args.epsilon, ctx, check_point, args.g_dl_weight, args.output_path, args.checkpoint_path, args.training_data_path, args.activation, args.num_epoch, args.save_after_every, args.visualize_after_every, args.show_after_every) |
| |
| if args.test: |
| test(args.nef, args.ngf, args.nc, 1, args.Z, ctx, args.pretrained_encoder_path, args.pretrained_generator_path, args.output_path, args.testing_data_path, args.activation, args.save_embedding, args.embedding_path) |
| |
| if __name__ == '__main__': |
| logging.basicConfig(level=logging.DEBUG) |
| main() |