# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
context("optimizer")
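# Tests run on the CPU by default; set the environment variable R_GPU_ENABLE=1
# to switch the default context to the GPU.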
if (Sys.getenv("R_GPU_ENABLE") != "" && as.integer(Sys.getenv("R_GPU_ENABLE")) == 1) {
  mx.ctx.default(new = mx.gpu())
  message("Using GPU for testing.")
}
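# Shared fixture used by every test below: a one-unit FullyConnected layer with
# no bias and a squared-error (LinearRegressionOutput) loss, fed 3 samples of
# 2 features each, with the weight initialised to w1 = (1.1, 1.8). Only the
# weight gradient is requested ("write"); data and label gradients are "null".
# With this setup the weight gradient works out to roughly (-0.3, -0.8), so
# each test checks a single optimizer step starting from w1.

# SGD with learning.rate = 1 and no momentum: one step is w - lr * grad,
# i.e. (1.1, 1.8) - (-0.3, -0.8) = (1.4, 2.6).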
test_that("sgd", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write",
"null"))
optimizer <- mx.opt.create("sgd", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1,
clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.1)
})
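# Centered RMSProp (gamma1 = 0.95, gamma2 = 0.9): because the running gradient
# and squared-gradient estimates start at zero, the first step is much larger
# than the raw gradient, landing near (5.64, 6.38).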
test_that("rmsprop", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write",
"null"))
optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, gamma1 = 0.95,
gamma2 = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2, 1)),
tolerance = 0.1)
})
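# Adam (beta1 = 0.9, beta2 = 0.999): keeps exponential moving averages of the
# gradient and squared gradient; with zero-initialised state the first step
# moves each weight by about 3.16, landing near (4.26, 4.96).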
test_that("adam", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write",
"null"))
optimizer <- mx.opt.create("adam", learning.rate = 1, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2, 1)),
tolerance = 0.1)
})
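# AdaGrad with learning.rate = 1: the squared-gradient accumulator starts at
# zero, so the first step reduces to roughly w - lr * sign(grad),
# i.e. (1.1 + 1, 1.8 + 1) = (2.1, 2.8).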
test_that("adagrad", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write",
"null"))
optimizer <- mx.opt.create("adagrad", learning.rate = 1, epsilon = 1e-08, wd = 0,
rescale.grad = 1, clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2, 1)), tolerance = 0.1)
})
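# AdaDelta (rho = 0.9, epsilon = 1e-05): with both accumulators starting at
# zero, the first step is tiny (on the order of sqrt(epsilon / (1 - rho)) = 0.01
# per element), so the weights barely move: roughly (1.11, 1.81).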
test_that("adadelta", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write",
"null"))
optimizer <- mx.opt.create("adadelta", rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1,
clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2, 1)),
tolerance = 0.1)
})
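# NAG with momentum = 0 degenerates to plain SGD, so one step gives the same
# result as the sgd test above: (1.4, 2.6).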
test_that("nag_no_momentum", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null"))
optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1,
clip_gradient = -1)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.05)
})
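# NAG with momentum = 0.1: the Nesterov look-ahead makes the step slightly
# larger than the plain SGD step, and clip_gradient = 5 is a no-op here because
# the gradients are well below the threshold. Expected weights: about (1.45, 2.65).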
test_that("nag_momentum", {
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc_weight <- mx.symbol.Variable("fc_weight")
fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = TRUE,
name = "fc1", num_hidden = 1)
loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
x <- mx.nd.array(array(1:6, dim = 2:3))
y <- mx.nd.array(c(5, 11, 16))
w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null"))
optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0.1, wd = 0, rescale.grad = 1,
clip_gradient = 5)
updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
mx.exec.forward(exec, is.train = TRUE)
mx.exec.backward(exec)
arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
expect_equal(as.array(arg.blocks[[2]]), array(c(1.45, 2.65), dim = c(2, 1)), tolerance = 0.1)
})