#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* Gradient checks for various architectures.
*/
source("scripts/nn/layers/affine.dml") as affine
source("scripts/nn/layers/low_rank_affine.dml") as low_rank_affine
source("scripts/nn/layers/batch_norm1d.dml") as batch_norm1d
source("scripts/nn/layers/batch_norm2d.dml") as batch_norm2d
source("scripts/nn/layers/conv2d.dml") as conv2d
source("scripts/nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("scripts/nn/layers/conv2d_depthwise.dml") as conv2d_depthwise
source("scripts/nn/layers/conv2d_transpose.dml") as conv2d_transpose
source("scripts/nn/layers/conv2d_transpose_depthwise.dml") as conv2d_transpose_depthwise
source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("scripts/nn/layers/cross_entropy_loss2d.dml") as cross_entropy_loss2d
source("scripts/nn/layers/dropout.dml") as dropout
source("scripts/nn/layers/fm.dml") as fm
source("scripts/nn/layers/l1_loss.dml") as l1_loss
source("scripts/nn/layers/l1_reg.dml") as l1_reg
source("scripts/nn/layers/l2_loss.dml") as l2_loss
source("scripts/nn/layers/l2_reg.dml") as l2_reg
source("scripts/nn/layers/log_loss.dml") as log_loss
source("scripts/nn/layers/lstm.dml") as lstm
source("scripts/nn/layers/max_pool2d.dml") as max_pool2d
source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
source("scripts/nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
source("scripts/nn/layers/upsample2d.dml") as upsample2d
source("scripts/nn/layers/relu.dml") as relu
source("scripts/nn/layers/leaky_relu.dml") as leaky_relu
source("scripts/nn/layers/rnn.dml") as rnn
source("scripts/nn/layers/scale_shift1d.dml") as scale_shift1d
source("scripts/nn/layers/scale_shift2d.dml") as scale_shift2d
source("scripts/nn/layers/sigmoid.dml") as sigmoid
source("scripts/nn/layers/softmax.dml") as softmax
source("scripts/nn/layers/softmax2d.dml") as softmax2d
source("scripts/nn/layers/tanh.dml") as tanh
source("scripts/nn/test/conv2d_simple.dml") as conv2d_simple
source("scripts/nn/test/max_pool2d_simple.dml") as max_pool2d_simple
source("scripts/nn/test/util.dml") as test_util
source("scripts/nn/util.dml") as util
source("scripts/nn/layers/elu.dml") as elu
affine = function() {
/*
* Gradient check for the affine layer.
*/
print("Grad checking the affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[W, b] = affine::init(D, M)
# Compute analytical gradients of loss wrt parameters
out = affine::forward(X, W, b)
dout = l2_loss::backward(out, y)
[dX, dW, db] = affine::backward(dout, X, W, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
low_rank_affine = function() {
/*
* Gradient check for the low rank affine layer.
*/
print("Grad checking the low rank affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
R = 2 # rank
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[U, V, b] = low_rank_affine::init(D, M, R)
# Compute analytical gradients of loss wrt parameters
out = low_rank_affine::forward(X, U, V, b)
dout = l2_loss::backward(out, y)
[dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking U.")
for (i in 1:nrow(U)) {
for (j in 1:ncol(U)) {
# Compute numerical derivative
old = as.scalar(U[i,j])
U[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
U[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
U[i,j] = old # reset
dU_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
batch_norm1d = function() {
/*
* Gradient check for the 1D batch normalization layer.
*/
print("Grad checking the 1D batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
gamma = rand(rows=1, cols=D)
beta = rand(rows=1, cols=D)
ema_mean = rand(rows=1, cols=D)
ema_var = rand(rows=1, cols=D)
#[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
# Check training & testing modes
for (i in 1:2) {
if (i == 1)
mode = 'train'
else
mode = 'test'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
cache_mean, cache_var, cache_norm,
X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
}
batch_norm2d = function() {
/*
* Gradient check for the 2D (spatial) batch normalization layer.
*/
print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
gamma = rand(rows=C, cols=1)
beta = rand(rows=C, cols=1)
ema_mean = rand(rows=C, cols=1)
ema_var = rand(rows=C, cols=1)
#[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
# Check training mode only (the testing-mode loop below is currently disabled)
# for (i in 1:1) {
mode = 'train'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm2d::backward(dout, cache_mean, cache_var,
X, gamma, C, Hin, Win, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
# }
}
conv2d = function() {
/*
* Gradient check for the 2D convolutional layer using `im2col`.
*/
print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_builtin = function() {
/*
* Gradient check for the 2D convolutional layer using built-in
* functions.
*/
print("Grad checking the built-in 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_builtin::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_simple = function() {
/*
* Gradient check for the simple reference 2D convolutional layer.
*/
print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_simple::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_depthwise = function() {
/*
* Gradient check for the 2D depthwise convolutional layer.
*/
print("Grad checking the 2D depthwise convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
M = 4 # depth multiplier
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*M*Hin*Win)
# Create layers
[W, b] = conv2d_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_depthwise::backward(dout, Hout, Wout, X, W, b, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose = function() {
/*
* Gradient check for the 2D transpose convolutional layer.
*/
print("Grad checking the 2D transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 3 # input height
Win = 3 # input width
F = 2 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
y = rand(rows=N, cols=F*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose_depthwise = function() {
/*
* Gradient check for the 2D depthwise transpose convolutional layer.
*/
print("Grad checking the 2D depthwise transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 8 # num channels
Hin = 3 # input height
Win = 3 # input width
M = 4 # depth of filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
y = rand(rows=N, cols=C/M*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose_depthwise::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, M,
Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
cross_entropy_loss = function() {
/*
* Gradient check for the cross-entropy loss function.
*/
print("Grad checking the cross-entropy loss function.")
# Generate data
N = 3 # num examples
K = 10 # num targets
pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
pred = softmax::forward(pred) # normalized probs
y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
y = softmax::forward(y) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss::forward(pred, y)
pred[i,j] = old + h
lossph = cross_entropy_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
cross_entropy_loss2d = function() {
/*
* Gradient check for the 2D cross-entropy loss function.
*/
print("Grad checking the 2D cross-entropy loss function.")
# Generate data
N = 3 # num examples
C = 10 # num targets
Hin = 5 # example height
Win = 5 # example width
pred = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
pred = softmax2d::forward(pred, C) # normalized probs
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
y = softmax2d::forward(y, C) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss2d::backward(pred, y, C)
# Grad check
h = 1e-6
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old + h
lossph = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
dropout = function() {
/*
* Gradient check for the (inverted) dropout layer.
*/
print("Grad checking the (inverted) dropout layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 100 # num neurons
p = 0.5 # probability of dropping neuron output
seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
[out, mask] = dropout::forward(X, p, seed)
dout = l2_loss::backward(out, y)
dX = dropout::backward(dout, X, p, mask)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, mask] = dropout::forward(X, p, seed)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, mask] = dropout::forward(X, p, seed)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
fm = function() {
/*
* Gradient check for the factorization machine (FM) layer.
*/
print("Grad checking the factorization machine (FM) layer with L2 loss.")
# Generate data
n = 5 # num examples
d = 100 # num features
k = 2 # factorization dimensionality
X = rand(rows=n, cols=d)
y = rand(rows=n, cols=1)
[w0, W, V] = fm::init(d, k)
# Compute analytical gradients of loss wrt parameters
out = fm::forward(X, w0, W, V)
dout = l2_loss::backward(out, y)
[dw0, dW, dV] = fm::backward(dout, X, w0, W, V)
# Grad check
h = 1e-5
print(" - Grad checking w0.")
for (i in 1:nrow(w0)) {
for (j in 1:ncol(w0)) {
# Compute numerical derivative
old = as.scalar(w0[i,j])
w0[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
w0[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
w0[i,j] = old # reset
dw0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dw0[i,j]), dw0_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
}
l1_loss = function() {
/*
* Gradient check for the L1 loss function.
*/
print("Grad checking the L1 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l1_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l1_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l1_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l1_reg = function() {
/*
* Gradient check for the L1 regularization function.
*/
print("Grad checking the L1 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l1_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l1_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l1_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
l2_loss = function() {
/*
* Gradient check for the L2 loss function.
*/
print("Grad checking the L2 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l2_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l2_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l2_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l2_reg = function() {
/*
* Gradient check for the L2 regularization function.
*/
print("Grad checking the L2 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l2_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l2_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l2_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
log_loss = function() {
/*
* Gradient check for the log loss function.
*/
print("Grad checking the log loss function.")
# Generate data
N = 20 # num examples
D = 1 # num targets
pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
# Compute analytical gradient
dpred = log_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = log_loss::forward(pred, y)
pred[i,j] = old + h
lossph = log_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
lstm = function() {
/*
* Gradient check for the LSTM layer.
*/
print("Grad checking the LSTM layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
yc = rand(rows=N, cols=M)
out0 = rand(rows=N, cols=M)
c0 = rand(rows=N, cols=M)
[W, b, dummy, dummy2] = lstm::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
dout = l2_loss::backward(out, y)
dc = l2_loss::backward(c, yc)
[dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
cache_out, cache_c, cache_ifog)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
X[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
W[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
b[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
out0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
print(" - Grad checking c0.")
for (i in 1:nrow(c0)) {
for (j in 1:ncol(c0)) {
# Compute numerical derivative
old = as.scalar(c0[i,j])
c0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
c0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
c0[i,j] = old # reset
dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
}
}
}
}
max_pool2d = function() {
/*
* Gradient check for the 2D max pooling layer.
*/
print("Grad checking the 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_builtin = function() {
/*
* Gradient check for the built-in 2D max pooling layer.
*/
print("Grad checking the built-in 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
avg_pool2d_builtin = function() {
/*
* Gradient check for the built-in 2D average pooling layer.
*/
print("Grad checking the built-in 2D avg pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = avg_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_simple = function() {
/*
* Gradient check for the simple reference 2D max pooling layer.
*/
print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
upsample2d = function() {
/*
* Gradient check for the 2D upsampling layer.
*/
print("Grad checking the upsample2d layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 3 # input height
Win = 3 # input width
size_h = 2 # upsampling factor, height
size_w = 2 # upsampling factor, width
M = C*Hin*Win # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M*size_h*size_w)
# Compute analytical gradients of loss wrt parameters
out = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
dout = l2_loss::backward(out, y)
dX = upsample2d::backward(dout, C, Hin, Win, size_h, size_w)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
relu = function() {
/*
* Gradient check for the ReLU nonlinearity layer.
*
* NOTE: This could result in a false negative in which the test
* fails due to a kink being crossed in the nonlinearity. This
* occurs when the evaluations f(x-h) and f(x+h) end up on opposite
* sides of the zero threshold of max(0, x). For now, just rerun
* the tests. In the future, we can explicitly check for this and
* rerun the test automatically; a sketch of such a check follows
* this function.
*/
print("Grad checking the ReLU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = relu::forward(X)
dout = l2_loss::backward(out, y)
dX = relu::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = relu::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = relu::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
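# A hedged sketch of the explicit kink check mentioned in the note above: the
# centered difference is unreliable whenever (x-h) and (x+h) fall on opposite
# sides of the ReLU kink at zero, so such entries could be skipped or rerun.
# The helper name `crosses_relu_kink` is illustrative only and is not called
# by the tests in this file.
crosses_relu_kink = function(double x, double h)
    return (boolean crossed) {
  crossed = (sign(x - h) != sign(x + h))
}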
leaky_relu = function() {
/*
* Gradient check for the Leaky ReLU nonlinearity layer.
*
* NOTE: This could result in a false negative in which the test
* fails due to a kink being crossed in the nonlinearity. This
* occurs when the evaluations f(x-h) and f(x+h) end up on opposite
* sides of the kink at zero, where the slope changes. For now,
* just rerun the tests. In the future, we can explicitly check for
* this and rerun the test automatically.
*/
print("Grad checking the Leaky ReLU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = leaky_relu::forward(X)
dout = l2_loss::backward(out, y)
dX = leaky_relu::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = leaky_relu::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = leaky_relu::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
rnn = function() {
/*
* Gradient check for the simple RNN layer.
*/
print("Grad checking the simple RNN layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
out0 = rand(rows=N, cols=M)
[W, b, dummy] = rnn::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
dout = l2_loss::backward(out, y)
[dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
out0[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
}
}
scale_shift1d = function() {
/*
* Gradient check for the 1D scale & shift layer.
*/
print("Grad checking the 1D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
[gamma, beta] = scale_shift1d::init(D)
# Compute analytical gradients of loss wrt parameters
out = scale_shift1d::forward(X, gamma, beta)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
scale_shift2d = function() {
/*
* Gradient check for the 2D scale & shift layer.
*/
print("Grad checking the 2D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
[gamma, beta] = scale_shift2d::init(C)
# Compute analytical gradients of loss wrt parameters
out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
sigmoid = function() {
/*
* Gradient check for the sigmoid nonlinearity layer.
*/
print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = sigmoid::forward(X)
dout = l2_loss::backward(out, y)
dX = sigmoid::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = sigmoid::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = sigmoid::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax = function() {
/*
* Gradient check for the softmax layer.
*/
print("Grad checking the softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 10 # num classes
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = y / rowSums(y)
# Compute analytical gradients of loss wrt parameters
out = softmax::forward(X)
dout = l2_loss::backward(out, y)
dX = softmax::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = softmax::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax2d = function() {
/*
* Gradient check for the 2D softmax layer.
*/
print("Grad checking the 2D softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 10 # num classes
Hin = 5 # example height
Win = 5 # example width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
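# Reshape targets from (N, C*Hin*Win) to (N*Hin*Win, C) so that each row holds one
# pixel's class scores; the rows can then be normalized to distributions and compared
# with the (row-wise) L2 loss.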
y_C_NHW = util::transpose_NCHW_to_CNHW(y, C)
y_NHW_C = t(y_C_NHW)
y_NHW_C = y_NHW_C / rowSums(y_NHW_C)
# Compute analytical gradients of loss wrt parameters
out = softmax2d::forward(X, C)
out_C_NHW = util::transpose_NCHW_to_CNHW(out, C)
out_NHW_C = t(out_C_NHW)
dout_NHW_C = l2_loss::backward(out_NHW_C, y_NHW_C)
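# Undo the reshape: t() yields a C x (N*Hin*Win) matrix, and transposing again with
# N as the "channel" count restores the original (N, C*Hin*Win) layout expected by
# softmax2d::backward.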
dout_C_NHW = t(dout_NHW_C)
dout = util::transpose_NCHW_to_CNHW(dout_C_NHW, N)
dX = softmax2d::backward(dout, X, C)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax2d::forward(X, C)
outmh_C_NHW = util::transpose_NCHW_to_CNHW(outmh, C)
outmh_NHW_C = t(outmh_C_NHW)
lossmh = l2_loss::forward(outmh_NHW_C, y_NHW_C)
X[i,j] = old + h
outph = softmax2d::forward(X, C)
outph_C_NHW = util::transpose_NCHW_to_CNHW(outph, C)
outph_NHW_C = t(outph_C_NHW)
lossph = l2_loss::forward(outph_NHW_C, y_NHW_C)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
tanh = function() {
/*
* Gradient check for the hyperbolic tangent (tanh) nonlinearity
* layer.
*/
print("Grad checking the tanh nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = tanh::forward(X)
dout = l2_loss::backward(out, y)
dX = tanh::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = tanh::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = tanh::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
two_layer_affine_l2_net = function() {
/*
* Gradient check for a two-layer, fully-connected, feed-forward
* network with ReLU nonlinearity and L2 loss.
*
* NOTE: This check can fail spuriously (i.e., report a large
* relative error even though the gradient is correct) when the
* central difference crosses a kink in the ReLU nonlinearity.
* This occurs when the two evaluations, f(x-h) and f(x+h), end up
* on opposite sides of the zero threshold of max(0, x). For now,
* just rerun the test. In the future, we could explicitly check
* for this and rerun the test automatically; a hedged sketch of
* such a check follows the helper functions below.
*/
print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
"nonlinearity, and an L2 loss function.")
# Generate input data
N = 1000 # num examples
D = 100 # num features
yD = 5 # num targets
X = rand(rows=N, cols=D, pdf="normal")
y = rand(rows=N, cols=yD)
# Create 2-layer, fully-connected network
M = 10 # number of hidden neurons
[W1, b1] = affine::init(D, M)
[W2, b2] = affine::init(M, yD)
W2 = W2 / sqrt(2) # rescale the output-layer init: it feeds the L2 loss rather than a ReLU
# Optimize for a short "burn-in" period to move the parameters into a
# characteristic mode of operation and to unmask any real issues.
print(" - Burn-in:")
lr = 0.01
decay = 0.99
for(i in 1:5) {
# Compute forward and backward passes of net
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
print(" - L2 loss: " + loss)
# Optimize with basic SGD
W1 = W1 - lr * dW1
b1 = b1 - lr * db1
W2 = W2 - lr * dW2
b2 = b2 - lr * db2
lr = lr * decay
}
# Compute analytical gradients
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
# Grad check
h = 1e-6
print(" - Grad checking X.")
for (i in 1:2) { # spot-check only the first two examples (of N=1000) to keep the runtime low
for (j in 1:ncol(X)) {
# Compute numerical derivative
old_x = as.scalar(X[i,j])
X[i,j] = old_x - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x # reset X[i,j]
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W1.")
for (i in 1:nrow(W1)) {
for (j in 1:ncol(W1)) {
# Compute numerical derivative
old_w = as.scalar(W1[i,j])
W1[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w # reset W1[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking W2.")
for (i in 1:nrow(W2)) {
for (j in 1:ncol(W2)) {
# Compute numerical derivative
old_w = as.scalar(W2[i,j])
W2[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w # reset W2[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking b1.")
for (i in 1:nrow(b1)) {
for (j in 1:ncol(b1)) {
# Compute numerical derivative
old_b = as.scalar(b1[i,j])
b1[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b # reset b1[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
}
}
print(" - Grad checking b2.")
for (i in 1:nrow(b2)) {
for (j in 1:ncol(b2)) {
# Compute numerical derivative
old_b = as.scalar(b2[i,j])
b2[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b # reset b2[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
}
}
}
/*
* Helper functions for the two-layer test network: a combined forward/backward
* run, the forward pass, and the backward pass.
*/
two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] pred, double loss,
matrix[double] dX,
matrix[double] dW1, matrix[double] db1,
matrix[double] dW2, matrix[double] db2) {
# Compute forward pass
[loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
# Compute backward pass
[dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
}
two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
# Compute forward pass
hout = affine::forward(X, W1, b1)
aout = relu::forward(hout)
pred = affine::forward(aout, W2, b2)
# Compute loss
loss = l2_loss::forward(pred, y)
}
two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
matrix[double] aout, matrix[double] hout,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] dX, matrix[double] dpred,
matrix[double] daout, matrix[double] dhout,
matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
# Compute backward pass
dpred = l2_loss::backward(pred, y)
[daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
dhout = relu::backward(daout, hout)
[dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
}
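/*
* Hedged sketch (not part of the original tests): the ReLU-kink check mentioned
* in the NOTE of two_layer_affine_l2_net could look roughly like the helper
* below. The name relu_kink_crossed and the usage shown are illustrative
* assumptions, not an existing API: the helper flags the case where the hidden
* pre-activations at (x-h) and (x+h) fall on opposite sides of zero, i.e. where
* the central difference straddles the max(0, x) kink and the relative-error
* check may fail spuriously.
*
* Possible (hypothetical) usage inside the X grad check above:
*   [lossmh, pred, aout, houtmh] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
*   [lossph, pred, aout, houtph] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
*   if (!relu_kink_crossed(houtmh, houtph))
*     rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
*/
relu_kink_crossed = function(matrix[double] hout_mh, matrix[double] hout_ph)
return (boolean crossed) {
# Flag a crossing if any hidden pre-activation changes sign between the two
# perturbed forward passes (element-wise comparison of the sign masks).
crossed = sum((hout_mh < 0) != (hout_ph < 0)) > 0
}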
elu = function() {
/*
* Gradient check for the ELU (exponential linear unit)
* nonlinearity layer.
*/
print("Grad checking the ELU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
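# Compute analytical gradients of loss wrt parameters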
out = elu::forward(X, 1)
dout = l2_loss::backward(out, y)
dX = elu::backward(dout, X, 1)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = elu::forward(X, 1)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = elu::forward(X, 1)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}