#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* Gradient checks for various architectures.
*/
source("scripts/nn/layers/affine.dml") as affine
source("scripts/nn/layers/low_rank_affine.dml") as low_rank_affine
source("scripts/nn/layers/batch_norm1d.dml") as batch_norm1d
source("scripts/nn/layers/batch_norm2d.dml") as batch_norm2d
source("scripts/nn/layers/conv2d.dml") as conv2d
source("scripts/nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("scripts/nn/layers/conv2d_depthwise.dml") as conv2d_depthwise
source("scripts/nn/layers/conv2d_transpose.dml") as conv2d_transpose
source("scripts/nn/layers/conv2d_transpose_depthwise.dml") as conv2d_transpose_depthwise
source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("scripts/nn/layers/cross_entropy_loss2d.dml") as cross_entropy_loss2d
source("scripts/nn/layers/dropout.dml") as dropout
source("scripts/nn/layers/fm.dml") as fm
source("scripts/nn/layers/l1_loss.dml") as l1_loss
source("scripts/nn/layers/l1_reg.dml") as l1_reg
source("scripts/nn/layers/l2_loss.dml") as l2_loss
source("scripts/nn/layers/l2_reg.dml") as l2_reg
source("scripts/nn/layers/log_loss.dml") as log_loss
source("scripts/nn/layers/lstm.dml") as lstm
source("scripts/nn/layers/max_pool2d.dml") as max_pool2d
source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
source("scripts/nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
source("scripts/nn/layers/upsample2d.dml") as upsample2d
source("scripts/nn/layers/relu.dml") as relu
source("scripts/nn/layers/leaky_relu.dml") as leaky_relu
source("scripts/nn/layers/rnn.dml") as rnn
source("scripts/nn/layers/scale_shift1d.dml") as scale_shift1d
source("scripts/nn/layers/scale_shift2d.dml") as scale_shift2d
source("scripts/nn/layers/sigmoid.dml") as sigmoid
source("scripts/nn/layers/softmax.dml") as softmax
source("scripts/nn/layers/softmax2d.dml") as softmax2d
source("scripts/nn/layers/tanh.dml") as tanh
source("scripts/nn/test/conv2d_simple.dml") as conv2d_simple
source("scripts/nn/test/max_pool2d_simple.dml") as max_pool2d_simple
source("scripts/nn/test/util.dml") as test_util
source("scripts/nn/util.dml") as util
source("scripts/nn/layers/elu.dml") as elu
affine = function() {
/*
* Gradient check for the affine layer.
*/
print("Grad checking the affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[W, b] = affine::init(D, M)
# Compute analytical gradients of loss wrt parameters
out = affine::forward(X, W, b)
dout = l2_loss::backward(out, y)
[dX, dW, db] = affine::backward(dout, X, W, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
low_rank_affine = function() {
/*
* Gradient check for the low rank affine layer.
*/
print("Grad checking the low rank affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
R = 2 # rank
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[U, V, b] = low_rank_affine::init(D, M, R)
# Compute analytical gradients of loss wrt parameters
out = low_rank_affine::forward(X, U, V, b)
dout = l2_loss::backward(out, y)
[dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking U.")
for (i in 1:nrow(U)) {
for (j in 1:ncol(U)) {
# Compute numerical derivative
old = as.scalar(U[i,j])
U[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
U[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
U[i,j] = old # reset
dU_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
batch_norm1d = function() {
/*
* Gradient check for the 1D batch normalization layer.
*/
print("Grad checking the 1D batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
gamma = rand(rows=1, cols=D)
beta = rand(rows=1, cols=D)
ema_mean = rand(rows=1, cols=D)
ema_var = rand(rows=1, cols=D)
#[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
# Check training & testing modes
for (i in 1:2) {
if (i == 1)
mode = 'train'
else
mode = 'test'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
cache_mean, cache_var, cache_norm,
X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
}
batch_norm2d = function() {
/*
* Gradient check for the 2D (spatial) batch normalization layer.
*/
print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
gamma = rand(rows=C, cols=1)
beta = rand(rows=C, cols=1)
ema_mean = rand(rows=C, cols=1)
ema_var = rand(rows=C, cols=1)
#[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
# Check training mode only (the testing-mode loop below is currently disabled)
# for (i in 1:1) {
mode = 'train'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm2d::backward(dout, cache_mean, cache_var,
X, gamma, C, Hin, Win, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
# }
}
conv2d = function() {
/*
* Gradient check for the 2D convolutional layer using `im2col`.
*/
print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_builtin = function() {
/*
* Gradient check for the 2D convolutional layer using built-in
* functions.
*/
print("Grad checking the built-in 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_builtin::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_simple = function() {
/*
* Gradient check for the simple reference 2D convolutional layer.
*/
print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_simple::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_depthwise = function() {
/*
* Gradient check for the 2D depthwise convolutional layer.
*/
print("Grad checking the 2D depthwise convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
M = 4 # depth multiplier
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*M*Hin*Win)
# Create layers
[W, b] = conv2d_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_depthwise::backward(dout, Hout, Wout, X, W, b, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose = function() {
/*
* Gradient check for the 2D transpose convolutional layer.
*/
print("Grad checking the 2D transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 3 # input height
Win = 3 # input width
F = 2 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
y = rand(rows=N, cols=F*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose_depthwise = function() {
/*
* Gradient check for the 2D depthwise transpose convolutional layer.
*/
print("Grad checking the 2D depthwise transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 8 # num channels
Hin = 3 # input height
Win = 3 # input width
M = 4 # depth of filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
y = rand(rows=N, cols=C/M*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose_depthwise::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, M,
Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
cross_entropy_loss = function() {
/*
* Gradient check for the cross-entropy loss function.
*/
print("Grad checking the cross-entropy loss function.")
# Generate data
N = 3 # num examples
K = 10 # num targets
pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
pred = softmax::forward(pred) # normalized probs
y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
y = softmax::forward(y) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss::forward(pred, y)
pred[i,j] = old + h
lossph = cross_entropy_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
cross_entropy_loss2d = function() {
/*
* Gradient check for the 2D cross-entropy loss function.
*/
print("Grad checking the 2D cross-entropy loss function.")
# Generate data
N = 3 # num examples
C = 10 # num targets
Hin = 5 # example height
Win = 5 # example width
pred = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
pred = softmax2d::forward(pred, C) # normalized probs
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
y = softmax2d::forward(y, C) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss2d::backward(pred, y, C)
# Grad check
h = 1e-6
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old + h
lossph = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
dropout = function() {
/*
* Gradient check for the (inverted) dropout layer.
*/
print("Grad checking the (inverted) dropout layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 100 # num neurons
p = 0.5 # probability of dropping neuron output
seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
[out, mask] = dropout::forward(X, p, seed)
dout = l2_loss::backward(out, y)
dX = dropout::backward(dout, X, p, mask)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, mask] = dropout::forward(X, p, seed)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, mask] = dropout::forward(X, p, seed)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
fm = function() {
/*
* Gradient check for the factorization machine (FM) layer.
*/
print("Grad checking the factorization machine (FM) layer with L2 loss.")
# Generate data
n = 5 # num examples
d = 100 # num features
k = 2 # factorization dimensionality
X = rand(rows=n, cols=d)
y = rand(rows=n, cols=1)
[w0, W, V] = fm::init(d, k)
# Compute analytical gradients of loss wrt parameters
out = fm::forward(X, w0, W, V)
dout = l2_loss::backward(out, y)
[dw0, dW, dV] = fm::backward(dout, X, w0, W, V)
# Grad check
h = 1e-5
print(" - Grad checking w0.")
for (i in 1:nrow(w0)) {
for (j in 1:ncol(w0)) {
# Compute numerical derivative
old = as.scalar(w0[i,j])
w0[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
w0[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
w0[i,j] = old # reset
dw0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dw0[i,j]), dw0_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
}
l1_loss = function() {
/*
* Gradient check for the L1 loss function.
*/
print("Grad checking the L1 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l1_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l1_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l1_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l1_reg = function() {
/*
* Gradient check for the L1 regularization function.
*/
print("Grad checking the L1 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l1_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l1_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l1_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
l2_loss = function() {
/*
* Gradient check for the L2 loss function.
*/
print("Grad checking the L2 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l2_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l2_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l2_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l2_reg = function() {
/*
* Gradient check for the L2 regularization function.
*/
print("Grad checking the L2 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l2_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l2_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l2_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
log_loss = function() {
/*
* Gradient check for the log loss function.
*/
print("Grad checking the log loss function.")
# Generate data
N = 20 # num examples
D = 1 # num targets
pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
# Compute analytical gradient
dpred = log_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = log_loss::forward(pred, y)
pred[i,j] = old + h
lossph = log_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
lstm = function() {
/*
* Gradient check for the LSTM layer.
*/
print("Grad checking the LSTM layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
yc = rand(rows=N, cols=M)
out0 = rand(rows=N, cols=M)
c0 = rand(rows=N, cols=M)
[W, b, dummy, dummy2] = lstm::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
dout = l2_loss::backward(out, y)
dc = l2_loss::backward(c, yc)
[dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
cache_out, cache_c, cache_ifog)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
X[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
W[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
b[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
out0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
print(" - Grad checking c0.")
for (i in 1:nrow(c0)) {
for (j in 1:ncol(c0)) {
# Compute numerical derivative
old = as.scalar(c0[i,j])
c0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
c0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
c0[i,j] = old # reset
dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
}
}
}
}
max_pool2d = function() {
/*
* Gradient check for the 2D max pooling layer.
*/
print("Grad checking the 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_builtin = function() {
/*
* Gradient check for the built-in 2D max pooling layer.
*/
print("Grad checking the built-in 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
avg_pool2d_builtin = function() {
/*
* Gradient check for the built-in 2D average pooling layer.
*/
print("Grad checking the built-in 2D avg pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = avg_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_simple = function() {
/*
* Gradient check for the simple reference 2D max pooling layer.
*/
print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
upsample2d = function() {
/*
* Gradient check for the 2D upsampling layer.
*/
print("Grad checking the upsample2d layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 3 # input height
Win = 3 # input width
size_h = 2 # upsampling factor, height
size_w = 2 # upsampling factor, width
M = C*Hin*Win # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M*size_h*size_w)
# Compute analytical gradients of loss wrt parameters
out = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
dout = l2_loss::backward(out, y)
dX = upsample2d::backward(dout, C, Hin, Win, size_h, size_w)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
relu = function() {
/*
* Gradient check for the ReLU nonlinearity layer.
*
* NOTE: This could result in a false negative in which the test
* fails due to a kink being crossed in the nonlinearity. This
* occurs when the evaluations f(x-h) and f(x+h) end up on opposite
* sides of the zero threshold of max(0, x). For now, just rerun
* the tests. In the future, we can explicitly check for this and
* rerun the test automatically; a sketch of such a check follows
* this function.
*/
print("Grad checking the ReLU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = relu::forward(X)
dout = l2_loss::backward(out, y)
dX = relu::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = relu::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = relu::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
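# A hedged sketch of the explicit kink check mentioned in the note above: the
# centered difference is unreliable whenever (x-h) and (x+h) fall on opposite
# sides of the ReLU kink at zero, so such entries could be skipped or rerun.
# The helper name `crosses_relu_kink` is illustrative only and is not called
# by the tests in this file.
crosses_relu_kink = function(double x, double h)
    return (boolean crossed) {
  crossed = (sign(x - h) != sign(x + h))
}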
leaky_relu = function() {
/*
* Gradient check for the Leaky ReLU nonlinearity layer.
*
* NOTE: This could result in a false negative in which the test
* fails due to a kink being crossed in the nonlinearity. This
* occurs when the evaluations f(x-h) and f(x+h) end up on opposite
* sides of the kink at zero, where the slope changes. For now,
* just rerun the tests. In the future, we can explicitly check for
* this and rerun the test automatically.
*/
print("Grad checking the Leaky ReLU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = leaky_relu::forward(X)
dout = l2_loss::backward(out, y)
dX = leaky_relu::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = leaky_relu::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = leaky_relu::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
rnn = function() {
/*
* Gradient check for the simple RNN layer.
*/
print("Grad checking the simple RNN layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
out0 = rand(rows=N, cols=M)
[W, b, dummy] = rnn::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
dout = l2_loss::backward(out, y)
[dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
out0[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
}
}
scale_shift1d = function() {
/*
* Gradient check for the 1D scale & shift layer.
*/
print("Grad checking the 1D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
[gamma, beta] = scale_shift1d::init(D)
# Compute analytical gradients of loss wrt parameters
out = scale_shift1d::forward(X, gamma, beta)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
scale_shift2d = function() {
/*
* Gradient check for the 2D scale & shift layer.
*/
print("Grad checking the 2D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
[gamma, beta] = scale_shift2d::init(C)
# Compute analytical gradients of loss wrt parameters
out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
sigmoid = function() {
/*
* Gradient check for the sigmoid nonlinearity layer.
*/
print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = sigmoid::forward(X)
dout = l2_loss::backward(out, y)
dX = sigmoid::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = sigmoid::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = sigmoid::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax = function() {
/*
* Gradient check for the softmax layer.
*/
print("Grad checking the softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 10 # num classes
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = y / rowSums(y)
# Compute analytical gradients of loss wrt parameters
out = softmax::forward(X)
dout = l2_loss::backward(out, y)
dX = softmax::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = softmax::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax2d = function() {
/*
* Gradient check for the 2D softmax layer.
*/
print("Grad checking the 2D softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 10 # num classes
Hin = 5 # example height
Win = 5 # example width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
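# Reshape targets from (N, C*Hin*Win) to (N*Hin*Win, C) so that each row holds one
# pixel's class scores; the rows can then be normalized to distributions and compared
# with the (row-wise) L2 loss.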
y_C_NHW = util::transpose_NCHW_to_CNHW(y, C)
y_NHW_C = t(y_C_NHW)
y_NHW_C = y_NHW_C / rowSums(y_NHW_C)
# Compute analytical gradients of loss wrt parameters
out = softmax2d::forward(X, C)
out_C_NHW = util::transpose_NCHW_to_CNHW(out, C)
out_NHW_C = t(out_C_NHW)
dout_NHW_C = l2_loss::backward(out_NHW_C, y_NHW_C)
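# Undo the reshape: t() yields a C x (N*Hin*Win) matrix, and transposing again with
# N as the "channel" count restores the original (N, C*Hin*Win) layout expected by
# softmax2d::backward.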
dout_C_NHW = t(dout_NHW_C)
dout = util::transpose_NCHW_to_CNHW(dout_C_NHW, N)
dX = softmax2d::backward(dout, X, C)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax2d::forward(X, C)
outmh_C_NHW = util::transpose_NCHW_to_CNHW(outmh, C)
outmh_NHW_C = t(outmh_C_NHW)
lossmh = l2_loss::forward(outmh_NHW_C, y_NHW_C)
X[i,j] = old + h
outph = softmax2d::forward(X, C)
outph_C_NHW = util::transpose_NCHW_to_CNHW(outph, C)
outph_NHW_C = t(outph_C_NHW)
lossph = l2_loss::forward(outph_NHW_C, y_NHW_C)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
tanh = function() {
/*
* Gradient check for the hyperbolic tangent (tanh) nonlinearity
* layer.
*/
print("Grad checking the tanh nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = tanh::forward(X)
dout = l2_loss::backward(out, y)
dX = tanh::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = tanh::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = tanh::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
two_layer_affine_l2_net = function() {
/*
* Gradient check for a two-layer, fully-connected, feed-forward
* network with ReLU nonlinearity and L2 loss.
*
* NOTE: This check can fail spuriously (i.e., report a large
* relative error even though the gradient is correct) when the
* central difference crosses a kink in the ReLU nonlinearity.
* This occurs when the two evaluations, f(x-h) and f(x+h), end up
* on opposite sides of the zero threshold of max(0, x). For now,
* just rerun the test. In the future, we could explicitly check
* for this and rerun the test automatically; a hedged sketch of
* such a check follows the helper functions below.
*/
print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
"nonlinearity, and an L2 loss function.")
# Generate input data
N = 1000 # num examples
D = 100 # num features
yD = 5 # num targets
X = rand(rows=N, cols=D, pdf="normal")
y = rand(rows=N, cols=yD)
# Create 2-layer, fully-connected network
M = 10 # number of hidden neurons
[W1, b1] = affine::init(D, M)
[W2, b2] = affine::init(M, yD)
W2 = W2 / sqrt(2) # rescale the output-layer init: it feeds the L2 loss rather than a ReLU
# Optimize for a short "burn-in" period to move the parameters into a
# characteristic mode of operation and to unmask any real issues.
print(" - Burn-in:")
lr = 0.01
decay = 0.99
for(i in 1:5) {
# Compute forward and backward passes of net
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
print(" - L2 loss: " + loss)
# Optimize with basic SGD
W1 = W1 - lr * dW1
b1 = b1 - lr * db1
W2 = W2 - lr * dW2
b2 = b2 - lr * db2
lr = lr * decay
}
# Compute analytical gradients
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
# Grad check
h = 1e-6
print(" - Grad checking X.")
for (i in 1:2) { # spot-check only the first two examples (of N=1000) to keep the runtime low
for (j in 1:ncol(X)) {
# Compute numerical derivative
old_x = as.scalar(X[i,j])
X[i,j] = old_x - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x # reset X[i,j]
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W1.")
for (i in 1:nrow(W1)) {
for (j in 1:ncol(W1)) {
# Compute numerical derivative
old_w = as.scalar(W1[i,j])
W1[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w # reset W1[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking W2.")
for (i in 1:nrow(W2)) {
for (j in 1:ncol(W2)) {
# Compute numerical derivative
old_w = as.scalar(W2[i,j])
W2[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w # reset W2[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking b1.")
for (i in 1:nrow(b1)) {
for (j in 1:ncol(b1)) {
# Compute numerical derivative
old_b = as.scalar(b1[i,j])
b1[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b # reset b1[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
}
}
print(" - Grad checking b2.")
for (i in 1:nrow(b2)) {
for (j in 1:ncol(b2)) {
# Compute numerical derivative
old_b = as.scalar(b2[i,j])
b2[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b # reset b2[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
}
}
}
/*
* Helper functions for the two-layer test network: a combined forward/backward
* run, the forward pass, and the backward pass.
*/
two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] pred, double loss,
matrix[double] dX,
matrix[double] dW1, matrix[double] db1,
matrix[double] dW2, matrix[double] db2) {
# Compute forward pass
[loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
# Compute backward pass
[dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
}
two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
# Compute forward pass
hout = affine::forward(X, W1, b1)
aout = relu::forward(hout)
pred = affine::forward(aout, W2, b2)
# Compute loss
loss = l2_loss::forward(pred, y)
}
two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
matrix[double] aout, matrix[double] hout,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] dX, matrix[double] dpred,
matrix[double] daout, matrix[double] dhout,
matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
# Compute backward pass
dpred = l2_loss::backward(pred, y)
[daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
dhout = relu::backward(daout, hout)
[dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
}
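/*
* Hedged sketch (not part of the original tests): the ReLU-kink check mentioned
* in the NOTE of two_layer_affine_l2_net could look roughly like the helper
* below. The name relu_kink_crossed and the usage shown are illustrative
* assumptions, not an existing API: the helper flags the case where the hidden
* pre-activations at (x-h) and (x+h) fall on opposite sides of zero, i.e. where
* the central difference straddles the max(0, x) kink and the relative-error
* check may fail spuriously.
*
* Possible (hypothetical) usage inside the X grad check above:
*   [lossmh, pred, aout, houtmh] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
*   [lossph, pred, aout, houtph] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
*   if (!relu_kink_crossed(houtmh, houtph))
*     rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
*/
relu_kink_crossed = function(matrix[double] hout_mh, matrix[double] hout_ph)
return (boolean crossed) {
# Flag a crossing if any hidden pre-activation changes sign between the two
# perturbed forward passes (element-wise comparison of the sign masks).
crossed = sum((hout_mh < 0) != (hout_ph < 0)) > 0
}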
elu = function() {
/*
* Gradient check for the ELU (exponential linear unit)
* nonlinearity layer.
*/
print("Grad checking the ELU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
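# Compute analytical gradients of loss wrt parameters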
out = elu::forward(X, 1)
dout = l2_loss::backward(out, y)
dX = elu::backward(dout, X, 1)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = elu::forward(X, 1)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = elu::forward(X, 1)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}