#' Create an SGD optimizer with respective parameters.
#' Performs stochastic gradient descent with momentum.
#'
#' @param learning.rate float
#'      Step size.
#' @param momentum float, default=0
#'      The momentum factor.
#' @param wd float, default=0.0
#'      L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#'      Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#'      Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#'      The learning rate scheduler.
#'
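#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the variable names below are hypothetical.
#' opt <- mx.opt.sgd(learning.rate = 0.01, momentum = 0.9, wd = 1e-4)
#' w <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' g <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' state <- opt$create.state(1, w)   # momentum buffer (NULL when momentum == 0)
#' res <- opt$update(1, w, g, state) # one SGD-with-momentum step
#' w <- res$weight
#' state <- res$state
#' }
#'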
mx.opt.sgd <- function(learning.rate,
momentum=0,
wd=0,
rescale.grad=1,
clip_gradient = NULL,
lr_scheduler = NULL) {
# use lr as short for learning rate.
lr <- learning.rate
count <- 0
num_update <- 0
sgd <- new.env()
sgd$lr <- lr
sgd$count <- 0
sgd$num_update <- 0
create.state <- function(index, weight) {
if (momentum == 0) {
return(NULL)
} else {
ret <- (mx.nd.zeros(dim(weight), ctx(weight)))
return(ret)
}
}
update <- function(index, weight, grad, state) {
if (!is.null(lr_scheduler)){
lr_scheduler(sgd) ## changing lr
lr <- sgd$lr
## update count
indexKey <- paste0('ik', index)
if (!exists(envir = sgd, x = indexKey, inherits = FALSE)){
sgd[[indexKey]] <- 0
} else {
indexValue <- sgd[[indexKey]]
sgd[[indexKey]] <- indexValue + 1
sgd$num_update <- max(sgd$num_update, sgd[[indexKey]])
}
}
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
grad_ctx <- ctx(grad)
grad <- as.array(grad)
grad <- pmax(grad, -1 * clip_gradient)
grad <- pmin(grad, clip_gradient)
grad <- mx.nd.array(grad, grad_ctx)
} else {
stop("Error: clip_gradient should be positive number.")
}
}
if (is.null(state)) {
weight <- weight - lr * (grad + wd * weight)
} else {
mom <- state
mom <- mom * momentum
mom <- mom - lr * (grad + wd * weight)
weight <- weight + mom
state <- mom
}
return(list(weight=weight, state=state))
}
return(list(create.state=create.state, update=update))
}
#' Create an RMSProp optimizer with respective parameters.
#' Reference: Tieleman, T. and Hinton, G. Lecture 6.5 - RMSProp: Divide the gradient by a running average of its recent magnitude. COURSERA: Neural Networks for Machine Learning, 2012.
#' The implementation follows Eq. (38)-(45) of http://arxiv.org/pdf/1308.0850v5.pdf (Alex Graves, 2013).
#'
#' @param learning.rate float, default=0.002
#' Step size.
#' @param gamma1 float, default=0.95
#'      Decay factor for the moving averages of the gradient and the squared gradient.
#' @param gamma2 float, default=0.9
#'      The "momentum" factor.
#' @param wd float, default=0.0
#'      L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#'      Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#'      Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#'      The learning rate scheduler.
#'
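#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the variable names below are hypothetical.
#' opt <- mx.opt.rmsprop(learning.rate = 0.002, gamma1 = 0.95, gamma2 = 0.9)
#' w <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' g <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' state <- opt$create.state(1, w)   # list(n = ..., g = ..., delta = ...)
#' res <- opt$update(1, w, g, state) # one RMSProp step (Graves' formulation)
#' w <- res$weight
#' state <- res$state
#' }
#'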
mx.opt.rmsprop <- function(learning.rate=0.002,
gamma1=0.95,
gamma2=0.9,
wd=0,
rescale.grad=1,
clip_gradient = NULL,
lr_scheduler = NULL) {
# use lr as short for learning rate.
lr <- learning.rate
count <- 0
num_update <- 0
rmsprop <- new.env()
rmsprop$lr <- lr
rmsprop$count <- 0
rmsprop$num_update <- 0
create.state <- function(index, weight) {
return (list(n=mx.nd.zeros(dim(weight), ctx(weight)),
g=mx.nd.zeros(dim(weight), ctx(weight)),
delta=mx.nd.zeros(dim(weight), ctx(weight))))
}
update <- function(index, weight, grad, state) {
if (!is.null(lr_scheduler)){
lr_scheduler(rmsprop) ## changing lr
lr <- rmsprop$lr
## update count
indexKey <- paste0('ik', index)
if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)){
rmsprop[[indexKey]] <- 0
} else {
indexValue <- rmsprop[[indexKey]]
rmsprop[[indexKey]] <- indexValue + 1
rmsprop$num_update <- max(rmsprop$num_update, rmsprop[[indexKey]])
}
}
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
grad_ctx <- ctx(grad)
grad <- as.array(grad)
grad <- pmax(grad, -1 * clip_gradient)
grad <- pmin(grad, clip_gradient)
grad <- mx.nd.array(grad, grad_ctx)
} else {
stop("Error: clip_gradient should be positive number.")
}
}
n <- state$n
g <- state$g
delta <- state$delta
n <- gamma1 * n + (1 - gamma1) * (grad * grad)
g <- gamma1 * g + (1 - gamma1) * grad
delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd * weight)
weight <- weight + delta
state <- list(n=n, g=g, delta=delta)
return(list(weight=weight, state=state))
}
return(list(create.state=create.state, update=update))
}
#' Create an Adam optimizer with respective parameters.
#' Adam optimizer as described in [King2014].
#'
#' [King2014] Diederik Kingma, Jimmy Ba,
#' Adam: A Method for Stochastic Optimization,
#' http://arxiv.org/abs/1412.6980
#'
#' @param learning.rate float, default=0.001
#' Step size.
#' @param beta1 float, default=0.9
#' Exponential decay rate for the first moment estimates.
#' @param beta2 float, default=0.999
#' Exponential decay rate for the second moment estimates.
#' @param epsilon float, default=1e-8
#'      A small constant for numerical stability.
#' @param wd float, default=0.0
#'      L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#'      Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#'      Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#'      The learning rate scheduler.
#'
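#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the variable names below are hypothetical.
#' opt <- mx.opt.adam(learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999)
#' w <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' g <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' state <- opt$create.state(1, w)   # list(mean = ..., variance = ...)
#' res <- opt$update(1, w, g, state) # one bias-corrected Adam step
#' w <- res$weight
#' state <- res$state
#' }
#'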
mx.opt.adam <- function(learning.rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
wd=0,
rescale.grad=1,
clip_gradient = NULL,
lr_scheduler = NULL) {
# use lr as short for learning rate.
lr <- learning.rate
count <- 0
num_update <- 0
adam <- new.env()
adam$lr <- lr
adam$count <- 0
adam$num_update <- 0
create.state <- function(index, weight) {
return (list(mean=mx.nd.zeros(dim(weight), ctx(weight)),
variance=mx.nd.zeros(dim(weight), ctx(weight))))
}
update <- function(index, weight, grad, state) {
if (!is.null(lr_scheduler)){
lr_scheduler(adam) ## changing lr
lr <- adam$lr
## update count
indexKey <- paste0('ik', index)
if (!exists(envir = adam, x = indexKey, inherits = FALSE)){
adam[[indexKey]] <- 0
} else {
indexValue <- adam[[indexKey]]
adam[[indexKey]] <- indexValue + 1
adam$num_update <- max(adam$num_update, adam[[indexKey]])
}
}
# increment time
time.key <- paste0('t', index)
if (!exists(envir = adam, x = time.key, inherits = FALSE)){
adam[[time.key]] <- 0
}
t <- adam[[time.key]]
t <- t + 1
adam[[time.key]] <- t
mean <- state$mean
variance <- state$variance
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
grad_ctx <- ctx(grad)
grad <- as.array(grad)
grad <- pmax(grad, -1 * clip_gradient)
grad <- pmin(grad, clip_gradient)
grad <- mx.nd.array(grad, grad_ctx)
} else {
stop("Error: clip_gradient should be positive number.")
}
}
mean <- beta1 * mean + (1 - beta1) * grad
variance <- beta2 * variance + (1 - beta2) * (grad * grad)
coef1 <- 1 - beta1^t
coef2 <- 1 - beta2^t
lr <- lr * sqrt(coef2)/coef1
weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon)
weight <- weight - lr * wd * weight
state <- list(mean=mean, variance=variance)
return(list(weight=weight, state=state))
}
return(list(create.state=create.state, update=update))
}
#' Create an AdaGrad optimizer with respective parameters.
#' AdaGrad optimizer of Duchi et al., 2011.
#'
#' This implementation follows the formulation in Eq. (5) of
#' http://arxiv.org/pdf/1212.5701v1.pdf (Matthew D. Zeiler, 2012). AdaGrad can
#' help the network converge faster in some cases.
#'
#' @param learning.rate float, default=0.05
#' Step size.
#' @param epsilon float, default=1e-8
#'      A small constant for numerical stability.
#' @param wd float, default=0.0
#'      L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#'      Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#'      Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#'      The learning rate scheduler.
#'
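#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the variable names below are hypothetical.
#' opt <- mx.opt.adagrad(learning.rate = 0.05)
#' w <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' g <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' state <- opt$create.state(1, w)   # accumulated squared-gradient history
#' res <- opt$update(1, w, g, state) # one AdaGrad step
#' w <- res$weight
#' state <- res$state
#' }
#'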
mx.opt.adagrad <- function(learning.rate=0.05,
epsilon=1e-8,
wd=0,
rescale.grad=1,
clip_gradient = NULL,
lr_scheduler = NULL) {
# use lr as short for learning rate.
lr <- learning.rate
count <- 0
num_update <- 0
adagrad <- new.env()
adagrad$lr <- lr
adagrad$count <- 0
adagrad$num_update <- 0
create.state <- function(index, weight) {
return (mx.nd.zeros(dim(weight), ctx(weight))) #history
}
update <- function(index, weight, grad, state) {
if (!is.null(lr_scheduler)){
lr_scheduler(adagrad) ## changing lr
lr <- adagrad$lr
## update count
indexKey <- paste0('ik', index)
if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)){
adagrad[[indexKey]] <- 0
} else {
indexValue <- adagrad[[indexKey]]
adagrad[[indexKey]] <- indexValue + 1
adagrad$num_update <- max(adagrad$num_update, adagrad[[indexKey]])
}
}
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
grad_ctx <- ctx(grad)
grad <- as.array(grad)
grad <- pmax(grad, -1 * clip_gradient)
grad <- pmin(grad, clip_gradient)
grad <- mx.nd.array(grad, grad_ctx)
} else {
stop("Error: clip_gradient should be positive number.")
}
}
history <- state
history <- history + (grad * grad)
weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd * weight)
state <- history
return(list(weight=weight, state=state))
}
return(list(create.state=create.state, update=update))
}
#' Create an AdaDelta optimizer with respective parameters.
#'
#' AdaDelta optimizer as described in Zeiler, M. D. (2012).
#' *ADADELTA: An adaptive learning rate method.*
#' http://arxiv.org/abs/1212.5701
#'
#' @param rho float, default=0.90
#' Decay rate for both squared gradients and delta x.
#' @param epsilon float, default=1e-5
#'      The constant as described in the paper.
#' @param wd float, default=0.0
#'      L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#'      Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#'      Clip the gradient to the range [-clip_gradient, clip_gradient].
#'
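#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the variable names below are hypothetical. Note that AdaDelta
#' # takes no learning rate.
#' opt <- mx.opt.adadelta(rho = 0.90, epsilon = 1e-5)
#' w <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' g <- mx.nd.array(matrix(rnorm(6), 2, 3))
#' state <- opt$create.state(1, w)   # list(acc.g = ..., acc.delta = ...)
#' res <- opt$update(1, w, g, state) # one AdaDelta step
#' w <- res$weight
#' state <- res$state
#' }
#'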
mx.opt.adadelta <- function(rho=0.90,
epsilon=1e-5,
wd=0,
rescale.grad=1,
clip_gradient = NULL) {
adadelta <- new.env()
create.state <- function(index, weight) {
return (list(acc.g=mx.nd.zeros(dim(weight), ctx(weight)), # accumulated g
acc.delta=mx.nd.zeros(dim(weight), ctx(weight)))) # accumulated delta
}
update <- function(index, weight, grad, state) {
# preprocess grad
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
grad_ctx <- ctx(grad)
grad <- as.array(grad)
grad <- pmax(grad, -1 * clip_gradient)
grad <- pmin(grad, clip_gradient)
grad <- mx.nd.array(grad, grad_ctx)
} else {
stop("Error: clip_gradient should be positive number.")
}
}
# accumulated g and delta initlization
acc.g <- state$acc.g
acc.delta <- state$acc.delta
# update g, delta
acc.g <- rho * acc.g + (1 - rho) * (grad * grad)
current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g + epsilon) * grad
acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta)
weight <- weight - current.delta - wd * weight
state <- list(acc.g=acc.g, acc.delta=acc.delta)
return(list(weight=weight, state=state))
}
return(list(create.state=create.state, update=update))
}
#' Create an optimizer by name and parameters
#'
#' @param name The name of the optimizer
#' @param ... Additional arguments
#'
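#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation).
#' # Extra arguments are forwarded to the chosen optimizer's constructor.
#' opt <- mx.opt.create("sgd", learning.rate = 0.01, momentum = 0.9)
#' }
#'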
#' @export
mx.opt.create <- function(name, ...) {
if (name == "sgd") {
return(mx.opt.sgd(...))
}
else if (name == "rmsprop") {
return (mx.opt.rmsprop(...))
}
else if (name == "adam") {
return (mx.opt.adam(...))
}
else if (name == "adagrad") {
return (mx.opt.adagrad(...))
}
else if (name == "adadelta") {
return (mx.opt.adadelta(...))
}
stop(paste("Unknown optimizer ", name))
}
#' Get an updater closure that can take list of weight and gradient
#' and return updated list of weight.
#'
#' @param optimizer The optimizer
#' @param weights The weights to be optimized
#'
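#' @examples
#' \dontrun{
#' # Illustrative sketch only (assumes a working mxnet installation);
#' # the weight/gradient names below are hypothetical.
#' opt <- mx.opt.create("sgd", learning.rate = 0.01, momentum = 0.9)
#' weights <- list(fc1_weight = mx.nd.zeros(c(64, 128)),
#'                 fc1_bias   = mx.nd.zeros(c(64)))
#' updater <- mx.opt.get.updater(opt, weights)
#' # After each forward/backward pass, pass the gradients (same layout as
#' # the weights) to the updater to obtain the updated weights.
#' grads <- list(fc1_weight = mx.nd.zeros(c(64, 128)),
#'               fc1_bias   = mx.nd.zeros(c(64)))
#' weights <- updater(weights, grads)
#' }
#'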
#' @export
mx.opt.get.updater <- function(optimizer, weights) {
n <- length(weights)
# This is the list that keeps track of the internal states of the optimizer
state.list <- lapply(1:n, function(i) {
if (is.null(weights[[i]])) return(NULL)
optimizer$create.state(i, weights[[i]])
})
update <- optimizer$update
update.closure <- function(weight, grad) {
ulist <- lapply(1:n, function(i) {
if (!is.null(grad[[i]])) {
update(i, weight[[i]], grad[[i]], state.list[[i]])
} else {
return(NULL)
}
})
# update the state list in the enclosing environment (mutating assignment via <<-)
state.list <<- lapply(ulist, function(x) {
x$state
})
# return updated weight list
weight.list <- lapply(ulist, function(x) {
x$weight
})
return(weight.list)
}
return(update.closure)
}