#' Create an SGD optimizer with respective parameters.
#' Performs stochastic gradient descent, optionally with a momentum update.
#'
#' @param learning.rate float
#' Step size.
#' @param momentum float, default=0
#' Momentum factor; when 0, no momentum state is kept.
#' @param wd float, default=0.0
#' L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#' Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#' Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
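#' @examples
#' \dontrun{
#' ## A minimal usage sketch (illustrative only): it assumes the package's
#' ## NDArray helpers mx.nd.array() and mx.cpu() behave as used elsewhere in
#' ## this file, and that 1 is the index of the parameter being updated.
#' opt <- mx.opt.sgd(learning.rate = 0.01, momentum = 0.9, wd = 1e-4)
#' w <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' g <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' state <- opt$create.state(1, w)    # momentum buffer (NULL when momentum == 0)
#' res <- opt$update(1, w, g, state)  # one SGD step
#' w <- res$weight
#' state <- res$state
#' }
#'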
mx.opt.sgd <- function(learning.rate,
                       momentum = 0,
                       wd = 0,
                       rescale.grad = 1,
                       clip_gradient = NULL,
                       lr_scheduler = NULL) {
  # use lr as shorthand for learning rate.
  lr <- learning.rate
  count <- 0
  num_update <- 0

  sgd <- new.env()
  sgd$lr <- lr
  sgd$count <- 0
  sgd$num_update <- 0

  create.state <- function(index, weight) {
    if (momentum == 0) {
      return(NULL)
    } else {
      ret <- mx.nd.zeros(dim(weight), ctx(weight))
      return(ret)
    }
  }

  update <- function(index, weight, grad, state) {
    if (!is.null(lr_scheduler)) {
      lr_scheduler(sgd) ## changing lr
      lr <- sgd$lr
      ## update count
      indexKey <- paste0('ik', index)
      if (!exists(envir = sgd, x = indexKey, inherits = FALSE)) {
        sgd[[indexKey]] <- 0
      } else {
        indexValue <- sgd[[indexKey]]
        sgd[[indexKey]] <- indexValue + 1
        sgd$num_update <- max(sgd$num_update, sgd[[indexKey]])
      }
    }
    # rescale and (optionally) clip the gradient element-wise
    grad <- grad * rescale.grad
    if (!is.null(clip_gradient)) {
      if (clip_gradient >= 0) {
        grad_ctx <- ctx(grad)
        grad <- as.array(grad)
        grad <- pmax(grad, -1 * clip_gradient)
        grad <- pmin(grad, clip_gradient)
        grad <- mx.nd.array(grad, grad_ctx)
      } else {
        stop("Error: clip_gradient should be a positive number.")
      }
    }
    if (is.null(state)) {
      # plain SGD step with weight decay
      weight <- weight - lr * (grad + wd * weight)
    } else {
      # momentum update: state holds the running momentum buffer
      mom <- state
      mom <- mom * momentum
      mom <- mom - lr * (grad + wd * weight)
      weight <- weight + mom
      state <- mom
    }
    return(list(weight = weight, state = state))
  }
  return(list(create.state = create.state, update = update))
}

#' Create an RMSProp optimizer with respective parameters.
#' Reference: Tieleman, T. and Hinton, G. Lecture 6.5 - rmsprop: Divide the gradient
#' by a running average of its recent magnitude. COURSERA: Neural Networks for
#' Machine Learning, 2012, 4(2).
#' The code follows Eq. (38) - Eq. (45) of Alex Graves, 2013:
#' http://arxiv.org/pdf/1308.0850v5.pdf
#'
#' @param learning.rate float, default=0.002
#' Step size.
#' @param gamma1 float, default=0.95
#' Decay factor for the moving averages of the gradient and the squared gradient.
#' @param gamma2 float, default=0.9
#' "Momentum" factor.
#' @param wd float, default=0.0
#' L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#' Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#' Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
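#' @examples
#' \dontrun{
#' ## A minimal construction sketch (illustrative only); the optimizer is used
#' ## through the same create.state/update protocol as mx.opt.sgd.
#' opt <- mx.opt.rmsprop(learning.rate = 0.002, gamma1 = 0.95, gamma2 = 0.9)
#' w <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' state <- opt$create.state(1, w)  # list(n, g, delta), all zero-initialized
#' }
#'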
mx.opt.rmsprop <- function(learning.rate = 0.002,
                           gamma1 = 0.95,
                           gamma2 = 0.9,
                           wd = 0,
                           rescale.grad = 1,
                           clip_gradient = NULL,
                           lr_scheduler = NULL) {
  # use lr as shorthand for learning rate.
  lr <- learning.rate
  count <- 0
  num_update <- 0

  rmsprop <- new.env()
  rmsprop$lr <- lr
  rmsprop$count <- 0
  rmsprop$num_update <- 0

  create.state <- function(index, weight) {
    return(list(n = mx.nd.zeros(dim(weight), ctx(weight)),
                g = mx.nd.zeros(dim(weight), ctx(weight)),
                delta = mx.nd.zeros(dim(weight), ctx(weight))))
  }

  update <- function(index, weight, grad, state) {
    if (!is.null(lr_scheduler)) {
      lr_scheduler(rmsprop) ## changing lr
      lr <- rmsprop$lr
      ## update count
      indexKey <- paste0('ik', index)
      if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)) {
        rmsprop[[indexKey]] <- 0
      } else {
        indexValue <- rmsprop[[indexKey]]
        rmsprop[[indexKey]] <- indexValue + 1
        rmsprop$num_update <- max(rmsprop$num_update, rmsprop[[indexKey]])
      }
    }
    # rescale and (optionally) clip the gradient element-wise
    grad <- grad * rescale.grad
    if (!is.null(clip_gradient)) {
      if (clip_gradient >= 0) {
        grad_ctx <- ctx(grad)
        grad <- as.array(grad)
        grad <- pmax(grad, -1 * clip_gradient)
        grad <- pmin(grad, clip_gradient)
        grad <- mx.nd.array(grad, grad_ctx)
      } else {
        stop("Error: clip_gradient should be a positive number.")
      }
    }

    # running averages of the squared gradient (n) and the gradient (g),
    # then the smoothed update delta (Graves, 2013, Eq. 38-45)
    n <- state$n
    g <- state$g
    delta <- state$delta
    n <- gamma1 * n + (1 - gamma1) * (grad * grad)
    g <- gamma1 * g + (1 - gamma1) * grad
    delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g * g + 1e-4) + wd * weight)
    weight <- weight + delta
    state <- list(n = n, g = g, delta = delta)

    return(list(weight = weight, state = state))
  }
  return(list(create.state = create.state, update = update))
}

#' Create an Adam optimizer with respective parameters.
#' Adam optimizer as described in [King2014].
#'
#' [King2014] Diederik Kingma, Jimmy Ba,
#' Adam: A Method for Stochastic Optimization,
#' http://arxiv.org/abs/1412.6980
#'
#' @param learning.rate float, default=0.001
#' Step size.
#' @param beta1 float, default=0.9
#' Exponential decay rate for the first moment estimates.
#' @param beta2 float, default=0.999
#' Exponential decay rate for the second moment estimates.
#' @param epsilon float, default=1e-8
#' Small constant added to the denominator for numerical stability.
#' @param wd float, default=0.0
#' L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#' Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#' Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
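#' @examples
#' \dontrun{
#' ## A minimal construction sketch (illustrative only); the state holds the
#' ## first and second moment estimates, and each step is bias-corrected.
#' opt <- mx.opt.adam(learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999)
#' w <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' g <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' state <- opt$create.state(1, w)    # list(mean, variance)
#' res <- opt$update(1, w, g, state)
#' }
#'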
mx.opt.adam <- function(learning.rate = 0.001,
                        beta1 = 0.9,
                        beta2 = 0.999,
                        epsilon = 1e-8,
                        wd = 0,
                        rescale.grad = 1,
                        clip_gradient = NULL,
                        lr_scheduler = NULL) {
  # use lr as shorthand for learning rate.
  lr <- learning.rate
  count <- 0
  num_update <- 0

  adam <- new.env()
  adam$lr <- lr
  adam$count <- 0
  adam$num_update <- 0

  create.state <- function(index, weight) {
    return(list(mean = mx.nd.zeros(dim(weight), ctx(weight)),
                variance = mx.nd.zeros(dim(weight), ctx(weight))))
  }

  update <- function(index, weight, grad, state) {
    if (!is.null(lr_scheduler)) {
      lr_scheduler(adam) ## changing lr
      lr <- adam$lr
      ## update count
      indexKey <- paste0('ik', index)
      if (!exists(envir = adam, x = indexKey, inherits = FALSE)) {
        adam[[indexKey]] <- 0
      } else {
        indexValue <- adam[[indexKey]]
        adam[[indexKey]] <- indexValue + 1
        adam$num_update <- max(adam$num_update, adam[[indexKey]])
      }
    }

    # increment the per-parameter time step
    time.key <- paste0('t', index)
    if (!exists(envir = adam, x = time.key, inherits = FALSE)) {
      adam[[time.key]] <- 0
    }
    t <- adam[[time.key]]
    t <- t + 1
    adam[[time.key]] <- t

    mean <- state$mean
    variance <- state$variance

    # rescale and (optionally) clip the gradient element-wise
    grad <- grad * rescale.grad
    if (!is.null(clip_gradient)) {
      if (clip_gradient >= 0) {
        grad_ctx <- ctx(grad)
        grad <- as.array(grad)
        grad <- pmax(grad, -1 * clip_gradient)
        grad <- pmin(grad, clip_gradient)
        grad <- mx.nd.array(grad, grad_ctx)
      } else {
        stop("Error: clip_gradient should be a positive number.")
      }
    }

    # update biased first and second moment estimates
    mean <- beta1 * mean + (1 - beta1) * grad
    variance <- beta2 * variance + (1 - beta2) * (grad * grad)

    # bias correction folded into the step size
    coef1 <- 1 - beta1^t
    coef2 <- 1 - beta2^t
    lr <- lr * sqrt(coef2) / coef1

    weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon)
    weight <- weight - lr * wd * weight

    state <- list(mean = mean, variance = variance)

    return(list(weight = weight, state = state))
  }
  return(list(create.state = create.state, update = update))
}

#' Create an AdaGrad optimizer with respective parameters.
#' AdaGrad optimizer of Duchi et al., 2011.
#'
#' This code follows the version in Eq. (5) of http://arxiv.org/pdf/1212.5701v1.pdf
#' by Matthew D. Zeiler, 2012. AdaGrad can help the network converge faster
#' in some cases.
#'
#' @param learning.rate float, default=0.05
#' Step size.
#' @param epsilon float, default=1e-8
#' Small constant added under the square root for numerical stability.
#' @param wd float, default=0.0
#' L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#' Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#' Clip the gradient to the range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
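#' @examples
#' \dontrun{
#' ## A minimal construction sketch (illustrative only); the state is a single
#' ## NDArray that accumulates squared gradients.
#' opt <- mx.opt.adagrad(learning.rate = 0.05)
#' w <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' history <- opt$create.state(1, w)
#' }
#'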
mx.opt.adagrad <- function(learning.rate = 0.05,
                           epsilon = 1e-8,
                           wd = 0,
                           rescale.grad = 1,
                           clip_gradient = NULL,
                           lr_scheduler = NULL) {
  # use lr as shorthand for learning rate.
  lr <- learning.rate
  count <- 0
  num_update <- 0

  adagrad <- new.env()
  adagrad$lr <- lr
  adagrad$count <- 0
  adagrad$num_update <- 0

  create.state <- function(index, weight) {
    return(mx.nd.zeros(dim(weight), ctx(weight))) # history of squared gradients
  }

  update <- function(index, weight, grad, state) {
    if (!is.null(lr_scheduler)) {
      lr_scheduler(adagrad) ## changing lr
      lr <- adagrad$lr
      ## update count
      indexKey <- paste0('ik', index)
      if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)) {
        adagrad[[indexKey]] <- 0
      } else {
        indexValue <- adagrad[[indexKey]]
        adagrad[[indexKey]] <- indexValue + 1
        adagrad$num_update <- max(adagrad$num_update, adagrad[[indexKey]])
      }
    }

    # rescale and (optionally) clip the gradient element-wise
    grad <- grad * rescale.grad
    if (!is.null(clip_gradient)) {
      if (clip_gradient >= 0) {
        grad_ctx <- ctx(grad)
        grad <- as.array(grad)
        grad <- pmax(grad, -1 * clip_gradient)
        grad <- pmin(grad, clip_gradient)
        grad <- mx.nd.array(grad, grad_ctx)
      } else {
        stop("Error: clip_gradient should be a positive number.")
      }
    }

    # accumulate squared gradients and scale the step by their square root
    history <- state
    history <- history + (grad * grad)
    weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd * weight)
    state <- history

    return(list(weight = weight, state = state))
  }
  return(list(create.state = create.state, update = update))
}

#' Create an AdaDelta optimizer with respective parameters.
#'
#' AdaDelta optimizer as described in Zeiler, M. D. (2012).
#' *ADADELTA: An adaptive learning rate method.*
#' http://arxiv.org/abs/1212.5701
#'
#' @param rho float, default=0.90
#' Decay rate for both the squared gradients and the squared updates (delta x).
#' @param epsilon float, default=1e-5
#' Small constant as described in the paper.
#' @param wd float, default=0.0
#' L2 regularization coefficient added to all the weights.
#' @param rescale.grad float, default=1.0
#' Rescaling factor of the gradient.
#' @param clip_gradient float, optional
#' Clip the gradient to the range [-clip_gradient, clip_gradient].
#'
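#' @examples
#' \dontrun{
#' ## A minimal construction sketch (illustrative only); AdaDelta takes no
#' ## learning rate, and its state tracks accumulated squared gradients and
#' ## accumulated squared updates.
#' opt <- mx.opt.adadelta(rho = 0.9, epsilon = 1e-5)
#' w <- mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu())
#' state <- opt$create.state(1, w)  # list(acc.g, acc.delta)
#' }
#'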
mx.opt.adadelta <- function(rho = 0.90,
                            epsilon = 1e-5,
                            wd = 0,
                            rescale.grad = 1,
                            clip_gradient = NULL) {
  adadelta <- new.env()

  create.state <- function(index, weight) {
    return(list(acc.g = mx.nd.zeros(dim(weight), ctx(weight)),      # accumulated squared gradients
                acc.delta = mx.nd.zeros(dim(weight), ctx(weight)))) # accumulated squared updates
  }

  update <- function(index, weight, grad, state) {
    # rescale and (optionally) clip the gradient element-wise
    grad <- grad * rescale.grad
    if (!is.null(clip_gradient)) {
      if (clip_gradient >= 0) {
        grad_ctx <- ctx(grad)
        grad <- as.array(grad)
        grad <- pmax(grad, -1 * clip_gradient)
        grad <- pmin(grad, clip_gradient)
        grad <- mx.nd.array(grad, grad_ctx)
      } else {
        stop("Error: clip_gradient should be a positive number.")
      }
    }

    # accumulated squared gradients and squared updates from previous steps
    acc.g <- state$acc.g
    acc.delta <- state$acc.delta

    # update the accumulators and compute the current update
    acc.g <- rho * acc.g + (1 - rho) * (grad * grad)
    current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g + epsilon) * grad
    acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta)
    weight <- weight - current.delta - wd * weight
    state <- list(acc.g = acc.g, acc.delta = acc.delta)

    return(list(weight = weight, state = state))
  }
  return(list(create.state = create.state, update = update))
}

#' Create an optimizer by name and parameters
#'
#' @param name The name of the optimizer
#' @param ... Additional arguments passed to the optimizer constructor
#'
#' @export
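#' @examples
#' \dontrun{
#' ## A minimal usage sketch (illustrative only): extra arguments are forwarded
#' ## to the named optimizer's constructor.
#' opt <- mx.opt.create("sgd", learning.rate = 0.01, momentum = 0.9, wd = 1e-4)
#' }
#'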
mx.opt.create <- function(name, ...) {
  if (name == "sgd") {
    return(mx.opt.sgd(...))
  } else if (name == "rmsprop") {
    return(mx.opt.rmsprop(...))
  } else if (name == "adam") {
    return(mx.opt.adam(...))
  } else if (name == "adagrad") {
    return(mx.opt.adagrad(...))
  } else if (name == "adadelta") {
    return(mx.opt.adadelta(...))
  }
  stop(paste0("Unknown optimizer: ", name))
}

#' Get an updater closure that takes a list of weights and a list of gradients
#' and returns the updated list of weights.
#'
#' @param optimizer The optimizer
#' @param weights The list of weights to be optimized
#'
#' @export
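#' @examples
#' \dontrun{
#' ## A minimal usage sketch (illustrative only): weights and gradients are
#' ## positional lists of NDArrays; NULL gradient entries are skipped.
#' opt <- mx.opt.create("sgd", learning.rate = 0.01, momentum = 0.9)
#' weights <- list(mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu()))
#' grads <- list(mx.nd.array(matrix(rnorm(4), 2, 2), mx.cpu()))
#' updater <- mx.opt.get.updater(opt, weights)
#' weights <- updater(weights, grads)  # one optimization step
#' }
#'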
mx.opt.get.updater <- function(optimizer, weights) {
  n <- length(weights)
  # This list keeps track of the optimizer's internal state for each weight
  state.list <- lapply(1:n, function(i) {
    if (is.null(weights[[i]])) return(NULL)
    optimizer$create.state(i, weights[[i]])
  })
  update <- optimizer$update

  update.closure <- function(weight, grad) {
    ulist <- lapply(1:n, function(i) {
      if (!is.null(grad[[i]])) {
        update(i, weight[[i]], grad[[i]], state.list[[i]])
      } else {
        return(NULL)
      }
    })
    # update the state list kept in the enclosing environment (<<-)
    state.list <<- lapply(ulist, function(x) {
      x$state
    })
    # return the updated weight list
    weight.list <- lapply(ulist, function(x) {
      x$weight
    })
    return(weight.list)
  }
  return(update.closure)
}