| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| /* |
 * Gradient checks for the various layers and loss functions.
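 *
 * Each check compares a layer's analytical gradient from backward() against a
 * centered finite-difference estimate: every entry of an input or parameter is
 * perturbed by +/- h (h = 1e-5 unless noted otherwise), the loss is recomputed
 * via forward(), and the numerical derivative (loss(x+h) - loss(x-h)) / (2*h)
 * is compared to the analytical value with test_util::check_rel_grad_error.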
| */ |
| source("scripts/nn/layers/affine.dml") as affine |
| source("scripts/nn/layers/low_rank_affine.dml") as low_rank_affine |
| source("scripts/nn/layers/batch_norm1d.dml") as batch_norm1d |
| source("scripts/nn/layers/batch_norm2d.dml") as batch_norm2d |
| source("scripts/nn/layers/conv2d.dml") as conv2d |
| source("scripts/nn/layers/conv2d_builtin.dml") as conv2d_builtin |
| source("scripts/nn/layers/conv2d_depthwise.dml") as conv2d_depthwise |
| source("scripts/nn/layers/conv2d_transpose.dml") as conv2d_transpose |
| source("scripts/nn/layers/conv2d_transpose_depthwise.dml") as conv2d_transpose_depthwise |
| source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss |
| source("scripts/nn/layers/cross_entropy_loss2d.dml") as cross_entropy_loss2d |
| source("scripts/nn/layers/dropout.dml") as dropout |
| source("scripts/nn/layers/fm.dml") as fm |
| source("scripts/nn/layers/l1_loss.dml") as l1_loss |
| source("scripts/nn/layers/l1_reg.dml") as l1_reg |
| source("scripts/nn/layers/l2_loss.dml") as l2_loss |
| source("scripts/nn/layers/l2_reg.dml") as l2_reg |
| source("scripts/nn/layers/log_loss.dml") as log_loss |
| source("scripts/nn/layers/lstm.dml") as lstm |
| source("scripts/nn/layers/max_pool2d.dml") as max_pool2d |
| source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin |
| source("scripts/nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin |
| source("scripts/nn/layers/upsample2d.dml") as upsample2d |
| source("scripts/nn/layers/relu.dml") as relu |
| source("scripts/nn/layers/rnn.dml") as rnn |
| source("scripts/nn/layers/scale_shift1d.dml") as scale_shift1d |
| source("scripts/nn/layers/scale_shift2d.dml") as scale_shift2d |
| source("scripts/nn/layers/sigmoid.dml") as sigmoid |
| source("scripts/nn/layers/softmax.dml") as softmax |
| source("scripts/nn/layers/softmax2d.dml") as softmax2d |
| source("scripts/nn/layers/tanh.dml") as tanh |
| source("scripts/nn/test/conv2d_simple.dml") as conv2d_simple |
| source("scripts/nn/test/max_pool2d_simple.dml") as max_pool2d_simple |
| source("scripts/nn/test/util.dml") as test_util |
| source("scripts/nn/util.dml") as util |
| source("scripts/nn/layers/elu.dml") as elu |
| |
| affine = function() { |
| /* |
| * Gradient check for the affine layer. |
| */ |
| print("Grad checking the affine layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 100 # num features |
| M = 10 # num neurons |
| X = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=M) |
| [W, b] = affine::init(D, M) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = affine::forward(X, W, b) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db] = affine::backward(dout, X, W, b) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = affine::forward(X, W, b) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = affine::forward(X, W, b) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| outmh = affine::forward(X, W, b) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| outph = affine::forward(X, W, b) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| outmh = affine::forward(X, W, b) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| outph = affine::forward(X, W, b) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| low_rank_affine = function() { |
| /* |
| * Gradient check for the low rank affine layer. |
| */ |
| print("Grad checking the low rank affine layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 100 # num features |
| M = 10 # num neurons |
| R = 2 # rank |
| X = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=M) |
| [U, V, b] = low_rank_affine::init(D, M, R) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = low_rank_affine::forward(X, U, V, b) |
| dout = l2_loss::backward(out, y) |
| [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = low_rank_affine::forward(X, U, V, b) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = low_rank_affine::forward(X, U, V, b) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking U.") |
| for (i in 1:nrow(U)) { |
| for (j in 1:ncol(U)) { |
| # Compute numerical derivative |
| old = as.scalar(U[i,j]) |
| U[i,j] = old - h |
| outmh = low_rank_affine::forward(X, U, V, b) |
| lossmh = l2_loss::forward(outmh, y) |
| U[i,j] = old + h |
| outph = low_rank_affine::forward(X, U, V, b) |
| lossph = l2_loss::forward(outph, y) |
| U[i,j] = old # reset |
| dU_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking V.") |
| for (i in 1:nrow(V)) { |
| for (j in 1:ncol(V)) { |
| # Compute numerical derivative |
| old = as.scalar(V[i,j]) |
| V[i,j] = old - h |
| outmh = low_rank_affine::forward(X, U, V, b) |
| lossmh = l2_loss::forward(outmh, y) |
| V[i,j] = old + h |
| outph = low_rank_affine::forward(X, U, V, b) |
| lossph = l2_loss::forward(outph, y) |
| V[i,j] = old # reset |
| dV_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| outmh = low_rank_affine::forward(X, U, V, b) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| outph = low_rank_affine::forward(X, U, V, b) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| batch_norm1d = function() { |
| /* |
| * Gradient check for the 1D batch normalization layer. |
| */ |
| print("Grad checking the 1D batch normalization layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 100 # num features |
| mu = 0.9 # momentum |
| eps = 1e-5 # epsilon |
| X = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=D) |
| gamma = rand(rows=1, cols=D) |
| beta = rand(rows=1, cols=D) |
| ema_mean = rand(rows=1, cols=D) |
| ema_var = rand(rows=1, cols=D) |
| #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D) |
| |
| # Check training & testing modes |
| for (i in 1:2) { |
| if (i == 1) |
| mode = 'train' |
| else |
| mode = 'test' |
| print(" - Grad checking the '"+mode+"' mode.") |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| dout = l2_loss::backward(out, y) |
| [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd, |
| cache_mean, cache_var, cache_norm, |
| X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking gamma.") |
| for (i in 1:nrow(gamma)) { |
| for (j in 1:ncol(gamma)) { |
| # Compute numerical derivative |
| old = as.scalar(gamma[i,j]) |
| gamma[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| gamma[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| gamma[i,j] = old # reset |
| dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, |
| lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking beta.") |
| for (i in 1:nrow(beta)) { |
| for (j in 1:ncol(beta)) { |
| # Compute numerical derivative |
| old = as.scalar(beta[i,j]) |
| beta[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| beta[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = |
| batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| beta[i,j] = old # reset |
| dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, |
| lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| batch_norm2d = function() { |
| /* |
| * Gradient check for the 2D (spatial) batch normalization layer. |
| */ |
| print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| C = 2 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| mu = 0.9 # momentum |
| eps = 1e-5 # epsilon |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=C*Hin*Win) |
| gamma = rand(rows=C, cols=1) |
| beta = rand(rows=C, cols=1) |
| ema_mean = rand(rows=C, cols=1) |
| ema_var = rand(rows=C, cols=1) |
| #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C) |
| |
  # Check the training mode only
  mode = 'train'
| print(" - Grad checking the '"+mode+"' mode.") |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| dout = l2_loss::backward(out, y) |
| [dX, dgamma, dbeta] = batch_norm2d::backward(dout, cache_mean, cache_var, |
| X, gamma, C, Hin, Win, eps) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking gamma.") |
| for (i in 1:nrow(gamma)) { |
| for (j in 1:ncol(gamma)) { |
| # Compute numerical derivative |
| old = as.scalar(gamma[i,j]) |
| gamma[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| gamma[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| gamma[i,j] = old # reset |
| dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, |
| lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking beta.") |
| for (i in 1:nrow(beta)) { |
| for (j in 1:ncol(beta)) { |
| # Compute numerical derivative |
| old = as.scalar(beta[i,j]) |
| beta[i,j] = old - h |
| [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossmh = l2_loss::forward(outmh, y) |
| beta[i,j] = old + h |
| [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] = |
| batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) |
| lossph = l2_loss::forward(outph, y) |
| beta[i,j] = old # reset |
| dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, |
| lossph, lossmh) |
| } |
| } |
| # } |
| } |
| |
| conv2d = function() { |
| /* |
| * Gradient check for the 2D convolutional layer using `im2col`. |
| */ |
| print("Grad checking the `im2col` 2D convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 3 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| F = 4 # num filters |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 1 |
| pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=F*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d::init(F, C, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| conv2d_builtin = function() { |
| /* |
| * Gradient check for the 2D convolutional layer using built-in |
| * functions. |
| */ |
| print("Grad checking the built-in 2D convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 3 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| F = 4 # num filters |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 1 |
| pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=F*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d_builtin::init(F, C, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, |
| stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| conv2d_simple = function() { |
| /* |
| * Gradient check for the simple reference 2D convolutional layer. |
| */ |
| print("Grad checking the simple reference 2D convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 3 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| F = 4 # num filters |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 1 |
| pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=F*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d_simple::init(F, C, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, |
| stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| conv2d_depthwise = function() { |
| /* |
| * Gradient check for the 2D depthwise convolutional layer. |
| */ |
| print("Grad checking the 2D depthwise convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 3 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| M = 4 # depth multiplier |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 1 |
| pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=C*M*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d_depthwise::init(C, M, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db] = conv2d_depthwise::backward(dout, Hout, Wout, X, W, b, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| conv2d_transpose = function() { |
| /* |
| * Gradient check for the 2D transpose convolutional layer. |
| */ |
| print("Grad checking the 2D transpose convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 2 # num channels |
| Hin = 3 # input height |
| Win = 3 # input width |
| F = 2 # num filters |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 2 |
| pad = 1 |
| out_pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d_transpose::init(F, C, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| y = rand(rows=N, cols=F*Hout*Wout) |
  dout = l2_loss::backward(out, y)
| [dX, dW, db] = conv2d_transpose::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, |
| stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad, out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| conv2d_transpose_depthwise = function() { |
| /* |
| * Gradient check for the 2D depthwise transpose convolutional layer. |
| */ |
| print("Grad checking the 2D depthwise transpose convolutional layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 8 # num channels |
| Hin = 3 # input height |
| Win = 3 # input width |
| M = 4 # depth of filters |
| Hf = 3 # filter height |
| Wf = 3 # filter width |
| stride = 2 |
| pad = 1 |
| out_pad = 1 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| # Create layers |
| [W, b] = conv2d_transpose_depthwise::init(C, M, Hf, Wf) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| y = rand(rows=N, cols=C/M*Hout*Wout) |
  dout = l2_loss::backward(out, y)
| [dX, dW, db] = conv2d_transpose_depthwise::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, M, |
| Hf, Wf, stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf, |
| stride, stride, pad, pad, |
| out_pad, out_pad) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| cross_entropy_loss = function() { |
| /* |
| * Gradient check for the cross-entropy loss function. |
| */ |
| print("Grad checking the cross-entropy loss function.") |
| |
| # Generate data |
| N = 3 # num examples |
| K = 10 # num targets |
| pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform") |
| pred = softmax::forward(pred) # normalized probs |
| y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform") |
| y = softmax::forward(y) # normalized probs |
| |
| # Compute analytical gradient |
| dpred = cross_entropy_loss::backward(pred, y) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(pred)) { |
| for (j in 1:ncol(pred)) { |
| # Compute numerical derivative |
| old = as.scalar(pred[i,j]) |
| pred[i,j] = old - h |
| lossmh = cross_entropy_loss::forward(pred, y) |
| pred[i,j] = old + h |
| lossph = cross_entropy_loss::forward(pred, y) |
      pred[i,j] = old # reset
| dpred_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| cross_entropy_loss2d = function() { |
| /* |
| * Gradient check for the 2D cross-entropy loss function. |
| */ |
| print("Grad checking the 2D cross-entropy loss function.") |
| |
| # Generate data |
| N = 3 # num examples |
| C = 10 # num targets |
| Hin = 5 # example height |
| Win = 5 # example width |
| pred = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform") |
| pred = softmax2d::forward(pred, C) # normalized probs |
| |
| y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform") |
| y = softmax2d::forward(y, C) # normalized probs |
| |
| # Compute analytical gradient |
| dpred = cross_entropy_loss2d::backward(pred, y, C) |
| |
| # Grad check |
| h = 1e-6 |
| for (i in 1:nrow(pred)) { |
| for (j in 1:ncol(pred)) { |
| # Compute numerical derivative |
| old = as.scalar(pred[i,j]) |
| pred[i,j] = old - h |
| lossmh = cross_entropy_loss2d::forward(pred, y, C) |
| pred[i,j] = old + h |
| lossph = cross_entropy_loss2d::forward(pred, y, C) |
      pred[i,j] = old # reset
| dpred_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| dropout = function() { |
| /* |
| * Gradient check for the (inverted) dropout layer. |
| */ |
| print("Grad checking the (inverted) dropout layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| M = 100 # num neurons |
| p = 0.5 # probability of dropping neuron output |
| seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed |
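  # Note: the same fixed seed is reused for every forward call below so that an
  # identical dropout mask underlies the analytical and numerical gradients.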
| X = rand(rows=N, cols=M) |
| y = rand(rows=N, cols=M) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, mask] = dropout::forward(X, p, seed) |
| dout = l2_loss::backward(out, y) |
| dX = dropout::backward(dout, X, p, mask) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, mask] = dropout::forward(X, p, seed) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, mask] = dropout::forward(X, p, seed) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| fm = function() { |
| /* |
   * Gradient check for the factorization machine model.
| */ |
| print("Grad checking the factorization machines with L2 loss.") |
| |
| # Generate data |
  n = 5 # num examples
| d = 100 # num features |
| k = 2 # factorization dimensionality |
| X = rand(rows=n, cols=d) |
| y = rand(rows=n, cols=1) |
| [w0, W, V] = fm::init(d, k) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = fm::forward(X, w0, W, V) |
| dout = l2_loss::backward(out, y) |
| [dw0, dW, dV] = fm::backward(dout, X, w0, W, V) |
| |
| # Grad check |
| h = 1e-5 |
| |
| print(" - Grad checking w0.") |
| for (i in 1:nrow(w0)) { |
| for (j in 1:ncol(w0)) { |
| # Compute numerical derivative |
| old = as.scalar(w0[i,j]) |
| w0[i,j] = old - h # h = 1e-5 |
| outmh = fm::forward(X, w0, W, V) |
| lossmh = l2_loss::forward(outmh, y) |
| w0[i,j] = old + h # h = 1e-5 |
| outph = fm::forward(X, w0, W, V) |
| lossph = l2_loss::forward(outph, y) |
| w0[i,j] = old # reset |
| dw0_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dw0[i,j]), dw0_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h # h = 1e-5 |
| outmh = fm::forward(X, w0, W, V) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h # h = 1e-5 |
| outph = fm::forward(X, w0, W, V) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking V.") |
| for (i in 1:nrow(V)) { |
    for (j in 1:ncol(V)) {
| # Compute numerical derivative |
| old = as.scalar(V[i,j]) |
| V[i,j] = old - h # h = 1e-5 |
| outmh = fm::forward(X, w0, W, V) |
| lossmh = l2_loss::forward(outmh, y) |
      V[i,j] = old + h # h = 1e-5
| outph = fm::forward(X, w0, W, V) |
| lossph = l2_loss::forward(outph, y) |
| V[i,j] = old # reset |
| dV_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| l1_loss = function() { |
| /* |
| * Gradient check for the L1 loss function. |
| */ |
| print("Grad checking the L1 loss function.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 2 # num targets |
| pred = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=D) |
| |
| # Compute analytical gradient |
| dpred = l1_loss::backward(pred, y) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(pred)) { |
| for (j in 1:ncol(pred)) { |
| # Compute numerical derivative |
| old = as.scalar(pred[i,j]) |
| pred[i,j] = old - h |
| lossmh = l1_loss::forward(pred, y) |
| pred[i,j] = old + h |
| lossph = l1_loss::forward(pred, y) |
      pred[i,j] = old # reset
| dpred_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| l1_reg = function() { |
| /* |
| * Gradient check for the L1 regularization function. |
| */ |
| print("Grad checking the L1 regularization function.") |
| |
| # Generate data |
| D = 5 # num features |
| M = 3 # num neurons |
| lambda = 0.01 |
| W = rand(rows=D, cols=M) |
| |
| # Compute analytical gradient |
| dW = l1_reg::backward(W, lambda) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| reg_lossmh = l1_reg::forward(W, lambda) |
| W[i,j] = old + h |
| reg_lossph = l1_reg::forward(W, lambda) |
| W[i,j] = old # reset W[i,j] |
| dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, |
| reg_lossph, reg_lossmh) |
| } |
| } |
| } |
| |
| l2_loss = function() { |
| /* |
| * Gradient check for the L2 loss function. |
| */ |
| print("Grad checking the L2 loss function.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 2 # num targets |
| pred = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=D) |
| |
| # Compute analytical gradient |
| dpred = l2_loss::backward(pred, y) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(pred)) { |
| for (j in 1:ncol(pred)) { |
| # Compute numerical derivative |
| old = as.scalar(pred[i,j]) |
| pred[i,j] = old - h |
| lossmh = l2_loss::forward(pred, y) |
| pred[i,j] = old + h |
| lossph = l2_loss::forward(pred, y) |
      pred[i,j] = old # reset
| dpred_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| l2_reg = function() { |
| /* |
| * Gradient check for the L2 regularization function. |
| */ |
| print("Grad checking the L2 regularization function.") |
| |
| # Generate data |
| D = 5 # num features |
| M = 3 # num neurons |
| lambda = 0.01 |
| W = rand(rows=D, cols=M) |
| |
| # Compute analytical gradient |
| dW = l2_reg::backward(W, lambda) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| reg_lossmh = l2_reg::forward(W, lambda) |
| W[i,j] = old + h |
| reg_lossph = l2_reg::forward(W, lambda) |
| W[i,j] = old # reset W[i,j] |
| dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, |
| reg_lossph, reg_lossmh) |
| } |
| } |
| } |
| |
| log_loss = function() { |
| /* |
| * Gradient check for the log loss function. |
| */ |
| print("Grad checking the log loss function.") |
| |
| # Generate data |
| N = 20 # num examples |
| D = 1 # num targets |
| pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform") |
| y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform")) |
| |
| # Compute analytical gradient |
| dpred = log_loss::backward(pred, y) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(pred)) { |
| for (j in 1:ncol(pred)) { |
| # Compute numerical derivative |
| old = as.scalar(pred[i,j]) |
| pred[i,j] = old - h |
| lossmh = log_loss::forward(pred, y) |
| pred[i,j] = old + h |
| lossph = log_loss::forward(pred, y) |
      pred[i,j] = old # reset
| dpred_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| lstm = function() { |
| /* |
| * Gradient check for the LSTM layer. |
| */ |
| print("Grad checking the LSTM layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 5 # num features |
| T = 15 # num timesteps (sequence length) |
| M = 10 # num neurons |
| X = rand(rows=N, cols=T*D) |
| yc = rand(rows=N, cols=M) |
| out0 = rand(rows=N, cols=M) |
| c0 = rand(rows=N, cols=M) |
| [W, b, dummy, dummy2] = lstm::init(N, D, M) |
| |
| # test with (1) outputs from all timesteps, and (2) output from the final timestep |
| for (i in 1:2) { |
| if (i == 1) { |
| return_seq = TRUE |
| y = rand(rows=N, cols=T*M) |
| } |
| else { |
| return_seq = FALSE |
| y = rand(rows=N, cols=M) |
| } |
| |
| print(" - Grad checking with return_seq = " + return_seq) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| dout = l2_loss::backward(out, y) |
| dc = l2_loss::backward(c, yc) |
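    # The backward call consumes the gradients of both the output loss and the
    # cell-state loss, so the numerical checks below perturb one entry and sum
    # the two losses before taking the centered difference.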
| [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0, |
| cache_out, cache_c, cache_ifog) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outmh = l2_loss::forward(outmh, y) |
| loss_cmh = l2_loss::forward(cmh, yc) |
| lossmh = loss_outmh + loss_cmh |
| X[i,j] = old + h |
| [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outph = l2_loss::forward(outph, y) |
| loss_cph = l2_loss::forward(cph, yc) |
| lossph = loss_outph + loss_cph |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outmh = l2_loss::forward(outmh, y) |
| loss_cmh = l2_loss::forward(cmh, yc) |
| lossmh = loss_outmh + loss_cmh |
| W[i,j] = old + h |
| [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outph = l2_loss::forward(outph, y) |
| loss_cph = l2_loss::forward(cph, yc) |
| lossph = loss_outph + loss_cph |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outmh = l2_loss::forward(outmh, y) |
| loss_cmh = l2_loss::forward(cmh, yc) |
| lossmh = loss_outmh + loss_cmh |
| b[i,j] = old + h |
| [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outph = l2_loss::forward(outph, y) |
| loss_cph = l2_loss::forward(cph, yc) |
| lossph = loss_outph + loss_cph |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking out0.") |
| for (i in 1:nrow(out0)) { |
| for (j in 1:ncol(out0)) { |
| # Compute numerical derivative |
| old = as.scalar(out0[i,j]) |
| out0[i,j] = old - h |
| [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outmh = l2_loss::forward(outmh, y) |
| loss_cmh = l2_loss::forward(cmh, yc) |
| lossmh = loss_outmh + loss_cmh |
| out0[i,j] = old + h |
| [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outph = l2_loss::forward(outph, y) |
| loss_cph = l2_loss::forward(cph, yc) |
| lossph = loss_outph + loss_cph |
| out0[i,j] = old # reset |
| dout0_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking c0.") |
| for (i in 1:nrow(c0)) { |
| for (j in 1:ncol(c0)) { |
| # Compute numerical derivative |
| old = as.scalar(c0[i,j]) |
| c0[i,j] = old - h |
| [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outmh = l2_loss::forward(outmh, y) |
| loss_cmh = l2_loss::forward(cmh, yc) |
| lossmh = loss_outmh + loss_cmh |
| c0[i,j] = old + h |
| [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) |
| loss_outph = l2_loss::forward(outph, y) |
| loss_cph = l2_loss::forward(cph, yc) |
| lossph = loss_outph + loss_cph |
| c0[i,j] = old # reset |
| dc0_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| max_pool2d = function() { |
| /* |
| * Gradient check for the 2D max pooling layer. |
| */ |
| print("Grad checking the 2D max pooling layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 2 # num channels |
| Hin = 4 # input height |
| Win = 4 # input width |
| Hf = 2 # pool filter height |
| Wf = 2 # pool filter width |
| stride = 2 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| for (pad in 0:1) { |
| print(" - Grad checking w/ pad="+pad+".") |
| Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1)) |
| Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1)) |
| y = rand(rows=N, cols=C*Hout*Wout) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| dout = l2_loss::backward(out, y) |
| dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| max_pool2d_builtin = function() { |
| /* |
   * Gradient check for the 2D max pooling layer using built-in functions.
| */ |
| print("Grad checking the built-in 2D max pooling layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 2 # num channels |
| Hin = 4 # input height |
| Win = 4 # input width |
| Hf = 2 # pool filter height |
| Wf = 2 # pool filter width |
| stride = 2 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| for (pad in 0:1) { |
| print(" - Grad checking w/ pad="+pad+".") |
| Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1)) |
| Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1)) |
| y = rand(rows=N, cols=C*Hout*Wout) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| dout = l2_loss::backward(out, y) |
| dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| avg_pool2d_builtin = function() { |
| /* |
   * Gradient check for the 2D avg pooling layer using built-in functions.
| */ |
| print("Grad checking the built-in 2D avg pooling layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 2 # num channels |
| Hin = 4 # input height |
| Win = 4 # input width |
| Hf = 2 # pool filter height |
| Wf = 2 # pool filter width |
| stride = 2 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| for (pad in 0:1) { |
| print(" - Grad checking w/ pad="+pad+".") |
| Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1)) |
| Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1)) |
| y = rand(rows=N, cols=C*Hout*Wout) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| dout = l2_loss::backward(out, y) |
| dX = avg_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| |
| max_pool2d_simple = function() { |
| /* |
| * Gradient check for the simple reference 2D max pooling layer. |
| */ |
| print("Grad checking the simple reference 2D max pooling layer with L2 loss.") |
| |
| # Generate data |
| N = 2 # num examples |
| C = 2 # num channels |
| Hin = 4 # input height |
| Win = 4 # input width |
| Hf = 2 # pool filter height |
| Wf = 2 # pool filter width |
| stride = 2 |
| X = rand(rows=N, cols=C*Hin*Win) |
| |
| for (pad in 0:1) { |
| print(" - Grad checking w/ pad="+pad+".") |
| Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1)) |
| Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1)) |
| y = rand(rows=N, cols=C*Hout*Wout) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) |
| dout = l2_loss::backward(out, y) |
| dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, |
| pad, pad) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| upsample2d = function() { |
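| /* |
| * Gradient check for the 2D upsampling layer. |
| */ |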
| print("Grad checking the upsample2d layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| C = 2 # num channels |
| Hin = 3 # input height |
| Win = 3 # input width |
| size_h = 2 # height upsampling factor |
| size_w = 2 # width upsampling factor |
| M = C*Hin*Win # num input features |
| X = rand(rows=N, cols=M, min=-5, max=5) |
| y = rand(rows=N, cols=M*size_h*size_w) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = upsample2d::forward(X, C, Hin, Win, size_h, size_w) |
| dout = l2_loss::backward(out, y) |
| dX = upsample2d::backward(dout, C, Hin, Win, size_h, size_w) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = upsample2d::forward(X, C, Hin, Win, size_h, size_w) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = upsample2d::forward(X, C, Hin, Win, size_h, size_w) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| relu = function() { |
| /* |
| * Gradient check for the ReLU nonlinearity layer. |
| * |
| * NOTE: This could result in a false negative in which the test |
| * fails due to a kink being crossed in the nonlinearity. This |
| * occurs when the two evaluations, f(x-h) and f(x+h), end up on |
| * opposite sides of the zero threshold of max(0, x). For now, |
| * just rerun the test. In the future, we could explicitly check |
| * for this case and rerun the test automatically. |
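| * |
| * One possible check (not implemented here): a kink is crossed for |
| * entry (i,j) exactly when sign(old-h) != sign(old+h), i.e. when the |
| * two perturbed values straddle zero; such entries could be skipped |
| * or re-drawn before comparing gradients. |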
| */ |
| print("Grad checking the ReLU nonlinearity layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| M = 10 # num neurons |
| X = rand(rows=N, cols=M, min=-5, max=5) |
| y = rand(rows=N, cols=M) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = relu::forward(X) |
| dout = l2_loss::backward(out, y) |
| dX = relu::backward(dout, X) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = relu::forward(X) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = relu::forward(X) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| rnn = function() { |
| /* |
| * Gradient check for the simple RNN layer. |
| */ |
| print("Grad checking the simple RNN layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 5 # num features |
| T = 15 # num timesteps (sequence length) |
| M = 10 # num neurons |
| X = rand(rows=N, cols=T*D) |
| out0 = rand(rows=N, cols=M) |
| [W, b, dummy] = rnn::init(N, D, M) |
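| # rnn::init returns weights W and biases b; its third return value is |
| # unused here (assigned to dummy), since the randomly drawn out0 above |
| # is used as the initial output state instead. |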
| |
| # Test with (1) outputs from all timesteps, and (2) only the output from the final timestep |
| for (i in 1:2) { |
| if (i == 1) { |
| return_seq = TRUE |
| y = rand(rows=N, cols=T*M) |
| } |
| else { |
| return_seq = FALSE |
| y = rand(rows=N, cols=M) |
| } |
| |
| print(" - Grad checking with return_seq = " + return_seq) |
| |
| # Compute analytical gradients of loss wrt parameters |
| [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| dout = l2_loss::backward(out, y) |
| [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W.") |
| for (i in 1:nrow(W)) { |
| for (j in 1:ncol(W)) { |
| # Compute numerical derivative |
| old = as.scalar(W[i,j]) |
| W[i,j] = old - h |
| [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossmh = l2_loss::forward(outmh, y) |
| W[i,j] = old + h |
| [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossph = l2_loss::forward(outph, y) |
| W[i,j] = old # reset |
| dW_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b.") |
| for (i in 1:nrow(b)) { |
| for (j in 1:ncol(b)) { |
| # Compute numerical derivative |
| old = as.scalar(b[i,j]) |
| b[i,j] = old - h |
| [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossmh = l2_loss::forward(outmh, y) |
| b[i,j] = old + h |
| [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossph = l2_loss::forward(outph, y) |
| b[i,j] = old # reset |
| db_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking out0.") |
| for (i in 1:nrow(out0)) { |
| for (j in 1:ncol(out0)) { |
| # Compute numerical derivative |
| old = as.scalar(out0[i,j]) |
| out0[i,j] = old - h |
| [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossmh = l2_loss::forward(outmh, y) |
| out0[i,j] = old + h |
| [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) |
| lossph = l2_loss::forward(outph, y) |
| out0[i,j] = old # reset |
| dout0_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh) |
| } |
| } |
| } |
| } |
| |
| scale_shift1d = function() { |
| /* |
| * Gradient check for the 1D scale & shift layer. |
| */ |
| print("Grad checking the 1D scale & shift layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 100 # num features |
| X = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=D) |
| [gamma, beta] = scale_shift1d::init(D) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = scale_shift1d::forward(X, gamma, beta) |
| dout = l2_loss::backward(out, y) |
| [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = scale_shift1d::forward(X, gamma, beta) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = scale_shift1d::forward(X, gamma, beta) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking gamma.") |
| for (i in 1:nrow(gamma)) { |
| for (j in 1:ncol(gamma)) { |
| # Compute numerical derivative |
| old = as.scalar(gamma[i,j]) |
| gamma[i,j] = old - h |
| outmh = scale_shift1d::forward(X, gamma, beta) |
| lossmh = l2_loss::forward(outmh, y) |
| gamma[i,j] = old + h |
| outph = scale_shift1d::forward(X, gamma, beta) |
| lossph = l2_loss::forward(outph, y) |
| gamma[i,j] = old # reset |
| dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, |
| lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking beta.") |
| for (i in 1:nrow(beta)) { |
| for (j in 1:ncol(beta)) { |
| # Compute numerical derivative |
| old = as.scalar(beta[i,j]) |
| beta[i,j] = old - h |
| outmh = scale_shift1d::forward(X, gamma, beta) |
| lossmh = l2_loss::forward(outmh, y) |
| beta[i,j] = old + h |
| outph = scale_shift1d::forward(X, gamma, beta) |
| lossph = l2_loss::forward(outph, y) |
| beta[i,j] = old # reset |
| dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, |
| lossph, lossmh) |
| } |
| } |
| } |
| |
| scale_shift2d = function() { |
| /* |
| * Gradient check for the 2D scale & shift layer. |
| */ |
| print("Grad checking the 2D scale & shift layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| C = 2 # num channels |
| Hin = 5 # input height |
| Win = 5 # input width |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=C*Hin*Win) |
| [gamma, beta] = scale_shift2d::init(C) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| dout = l2_loss::backward(out, y) |
| [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking gamma.") |
| for (i in 1:nrow(gamma)) { |
| for (j in 1:ncol(gamma)) { |
| # Compute numerical derivative |
| old = as.scalar(gamma[i,j]) |
| gamma[i,j] = old - h |
| outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossmh = l2_loss::forward(outmh, y) |
| gamma[i,j] = old + h |
| outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossph = l2_loss::forward(outph, y) |
| gamma[i,j] = old # reset |
| dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, |
| lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking beta.") |
| for (i in 1:nrow(beta)) { |
| for (j in 1:ncol(beta)) { |
| # Compute numerical derivative |
| old = as.scalar(beta[i,j]) |
| beta[i,j] = old - h |
| outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossmh = l2_loss::forward(outmh, y) |
| beta[i,j] = old + h |
| outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) |
| lossph = l2_loss::forward(outph, y) |
| beta[i,j] = old # reset |
| dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, |
| lossph, lossmh) |
| } |
| } |
| } |
| |
| sigmoid = function() { |
| /* |
| * Gradient check for the sigmoid nonlinearity layer. |
| */ |
| print("Grad checking the sigmoid nonlinearity layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| M = 10 # num neurons |
| X = rand(rows=N, cols=M) |
| y = rand(rows=N, cols=M) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = sigmoid::forward(X) |
| dout = l2_loss::backward(out, y) |
| dX = sigmoid::backward(dout, X) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = sigmoid::forward(X) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = sigmoid::forward(X) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| softmax = function() { |
| /* |
| * Gradient check for the softmax layer. |
| */ |
| print("Grad checking the softmax layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| D = 10 # num classes |
| X = rand(rows=N, cols=D) |
| y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform") |
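| # Normalize each row of y into a valid probability distribution over classes. |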
| y = y / rowSums(y) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = softmax::forward(X) |
| dout = l2_loss::backward(out, y) |
| dX = softmax::backward(dout, X) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = softmax::forward(X) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = softmax::forward(X) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| softmax2d = function() { |
| /* |
| * Gradient check for the 2D softmax layer. |
| */ |
| print("Grad checking the 2D softmax layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| C = 10 # num classes |
| Hin = 5 # example height |
| Win = 5 # example width |
| |
| X = rand(rows=N, cols=C*Hin*Win) |
| y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform") |
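| # Reshape y so that each row holds the class scores of a single pixel |
| # ((N*Hin*Win) x C), then normalize each row into a valid distribution. |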
| y_C_NHW = util::transpose_NCHW_to_CNHW(y, C) |
| y_NHW_C = t(y_C_NHW) |
| y_NHW_C = y_NHW_C / rowSums(y_NHW_C) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = softmax2d::forward(X, C) |
| out_C_NHW = util::transpose_NCHW_to_CNHW(out, C) |
| out_NHW_C = t(out_C_NHW) |
| |
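| # Compute the loss gradient in the (N*Hin*Win) x C layout, then map it back |
| # to the N x (C*Hin*Win) layout expected by softmax2d::backward. |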
| dout_NHW_C = l2_loss::backward(out_NHW_C, y_NHW_C) |
| dout_C_NHW = t(dout_NHW_C) |
| dout = util::transpose_NCHW_to_CNHW(dout_C_NHW, N) |
| dX = softmax2d::backward(dout, X, C) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = softmax2d::forward(X, C) |
| outmh_C_NHW = util::transpose_NCHW_to_CNHW(outmh, C) |
| outmh_NHW_C = t(outmh_C_NHW) |
| lossmh = l2_loss::forward(outmh_NHW_C, y_NHW_C) |
| |
| X[i,j] = old + h |
| outph = softmax2d::forward(X, C) |
| outph_C_NHW = util::transpose_NCHW_to_CNHW(outph, C) |
| outph_NHW_C = t(outph_C_NHW) |
| lossph = l2_loss::forward(outph_NHW_C, y_NHW_C) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| tanh = function() { |
| /* |
| * Gradient check for the hyperbolic tangent (tanh) nonlinearity |
| * layer. |
| */ |
| print("Grad checking the tanh nonlinearity layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| M = 10 # num neurons |
| X = rand(rows=N, cols=M) |
| y = rand(rows=N, cols=M) |
| |
| # Compute analytical gradients of loss wrt parameters |
| out = tanh::forward(X) |
| dout = l2_loss::backward(out, y) |
| dX = tanh::backward(dout, X) |
| |
| # Grad check |
| h = 1e-5 |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = tanh::forward(X) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = tanh::forward(X) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| two_layer_affine_l2_net = function() { |
| /* |
| * Gradient check for a two-layer, fully-connected, feed-forward |
| * network with ReLU nonlinearity and L2 loss. |
| * |
| * NOTE: This could result in a false negative in which the test |
| * fails due to a kink being crossed in the ReLU nonlinearity. This |
| * occurs when the two evaluations, f(x-h) and f(x+h), end up on |
| * opposite sides of the zero threshold of max(0, x). For now, |
| * just rerun the test. In the future, we could explicitly check |
| * for this case and rerun the test automatically. |
| */ |
| print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " + |
| "nonlinearity, and an L2 loss function.") |
| |
| # Generate input data |
| N = 1000 # num examples |
| D = 100 # num features |
| yD = 5 # num targets |
| X = rand(rows=N, cols=D, pdf="normal") |
| y = rand(rows=N, cols=yD) |
| |
| # Create 2-layer, fully-connected network |
| M = 10 # number of hidden neurons |
| [W1, b1] = affine::init(D, M) |
| [W2, b2] = affine::init(M, yD) |
| W2 = W2 / sqrt(2) # different initialization, since being fed into l2 loss, instead of relu |
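| # (Assuming affine::init uses a ReLU-oriented scale on the order of sqrt(2/fan_in), |
| # dividing by sqrt(2) moves W2 closer to a scale suited to a linear output layer.) |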
| |
| # Optimize for a short "burn-in" period to move the parameters into a |
| # characteristic mode of operation and to unmask any real issues. |
| print(" - Burn-in:") |
| lr = 0.01 |
| decay = 0.99 |
| for(i in 1:5) { |
| # Compute forward and backward passes of net |
| [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2) |
| print(" - L2 loss: " + loss) |
| |
| # Optimize with basic SGD |
| W1 = W1 - lr * dW1 |
| b1 = b1 - lr * db1 |
| W2 = W2 - lr * dW2 |
| b2 = b2 - lr * db2 |
| lr = lr * decay |
| } |
| |
| # Compute analytical gradients |
| [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2) |
| |
| # Grad check |
| h = 1e-6 |
| print(" - Grad checking X.") |
| for (i in 1:2) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old_x = as.scalar(X[i,j]) |
| X[i,j] = old_x - h |
| [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| X[i,j] = old_x + h |
| [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| X[i,j] = old_x # reset X[i,j] |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W1.") |
| for (i in 1:nrow(W1)) { |
| for (j in 1:ncol(W1)) { |
| # Compute numerical derivative |
| old_w = as.scalar(W1[i,j]) |
| W1[i,j] = old_w - h |
| [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| W1[i,j] = old_w + h |
| [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| W1[i,j] = old_w # reset W1[i,j] |
| dWij_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking W2.") |
| for (i in 1:nrow(W2)) { |
| for (j in 1:ncol(W2)) { |
| # Compute numerical derivative |
| old_w = as.scalar(W2[i,j]) |
| W2[i,j] = old_w - h |
| [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| W2[i,j] = old_w + h |
| [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| W2[i,j] = old_w # reset W2[i,j] |
| dWij_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b1.") |
| for (i in 1:nrow(b1)) { |
| for (j in 1:ncol(b1)) { |
| # Compute numerical derivative |
| old_b = as.scalar(b1[i,j]) |
| b1[i,j] = old_b - h |
| [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| b1[i,j] = old_b + h |
| [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| b1[i,j] = old_b # reset b1[i,j] |
| dbij_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh) |
| } |
| } |
| |
| print(" - Grad checking b2.") |
| for (i in 1:nrow(b2)) { |
| for (j in 1:ncol(b2)) { |
| # Compute numerical derivative |
| old_b = as.scalar(b2[i,j]) |
| b2[i,j] = old_b - h |
| [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| b2[i,j] = old_b + h |
| [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| b2[i,j] = old_b # reset b2[i,j] |
| dbij_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh) |
| } |
| } |
| } |
| |
| /* |
| * Test network with forward/backward functions. |
| */ |
| two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y, |
| matrix[double] W1, matrix[double] b1, |
| matrix[double] W2, matrix[double] b2) |
| return (matrix[double] pred, double loss, |
| matrix[double] dX, |
| matrix[double] dW1, matrix[double] db1, |
| matrix[double] dW2, matrix[double] db2) { |
| # Compute forward pass |
| [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) |
| |
| # Compute backward pass |
| [dX, dpred, daout, dhout, dW1, db1, dW2, db2] = |
| two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2) |
| } |
| |
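| /* |
| * Computes the forward pass and L2 loss of the two-layer test network. |
| */ |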
| two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y, |
| matrix[double] W1, matrix[double] b1, |
| matrix[double] W2, matrix[double] b2) |
| return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) { |
| # Compute forward pass |
| hout = affine::forward(X, W1, b1) |
| aout = relu::forward(hout) |
| pred = affine::forward(aout, W2, b2) |
| |
| # Compute loss |
| loss = l2_loss::forward(pred, y) |
| } |
| |
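| /* |
| * Computes the backward pass of the two-layer test network. |
| */ |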
| two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred, |
| matrix[double] aout, matrix[double] hout, |
| matrix[double] W1, matrix[double] b1, |
| matrix[double] W2, matrix[double] b2) |
| return (matrix[double] dX, matrix[double] dpred, |
| matrix[double] daout, matrix[double] dhout, |
| matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) { |
| # Compute backward pass |
| dpred = l2_loss::backward(pred, y) |
| [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2) |
| dhout = relu::backward(daout, hout) |
| [dX, dW1, db1] = affine::backward(dhout, X, W1, b1) |
| } |
| |
| elu = function() { |
| /* |
| * Gradient check for the ELU nonlinearity layer. |
| */ |
| print("Grad checking the ELU nonlinearity layer with L2 loss.") |
| |
| # Generate data |
| N = 3 # num examples |
| M = 10 # num neurons |
| |
| X = rand(rows=N, cols=M, min=-5, max=5) |
| y = rand(rows=N, cols=M) |
| |
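| # Compute analytical gradients of loss wrt parameters (alpha fixed at 1) |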
| out = elu::forward(X, 1) |
| dout = l2_loss::backward(out, y) |
| dX = elu::backward(dout, X, 1) |
| |
| # Grad check |
| h = 1e-5 |
| print(" - Grad checking X.") |
| for (i in 1:nrow(X)) { |
| for (j in 1:ncol(X)) { |
| # Compute numerical derivative |
| old = as.scalar(X[i,j]) |
| X[i,j] = old - h |
| outmh = elu::forward(X, 1) |
| lossmh = l2_loss::forward(outmh, y) |
| X[i,j] = old + h |
| outph = elu::forward(X, 1) |
| lossph = l2_loss::forward(outph, y) |
| X[i,j] = old # reset |
| dX_num = (lossph-lossmh) / (2*h) # numerical derivative |
| |
| # Check error |
| rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) |
| } |
| } |
| } |