projects/breast_cancer/convnet.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 /*
  * Breast Cancer LeNet-like ConvNet Model
  */
 # Imports
 source("nn/layers/affine.dml") as affine
 source("nn/layers/conv2d_builtin.dml") as conv2d
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l2_reg.dml") as l2_reg
 source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("nn/layers/relu.dml") as relu
 source("nn/layers/softmax.dml") as softmax
 #source("nn/optim/adam.dml") as adam
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

 train = function(matrix[double] X, matrix[double] Y,
                  matrix[double] X_val, matrix[double] Y_val,
                  int C, int Hin, int Win,
                  double lr, double mu, double decay, double lambda,
                  int batch_size, int epochs, int log_interval,
                  string checkpoint_dir)
     return (matrix[double] Wc1, matrix[double] bc1,
             matrix[double] Wc2, matrix[double] bc2,
             matrix[double] Wc3, matrix[double] bc3,
             matrix[double] Wa1, matrix[double] ba1,
             matrix[double] Wa2, matrix[double] ba2) {
   /*
    * Trains a convolutional net using a "LeNet"-like architecture.
    *
    * The input matrix, X, has N examples, each represented as a 3D
    * volume unrolled into a single vector.  The targets, Y, have K
    * classes, and are one-hot encoded.
    *
    * Inputs:
    *  - X: Input data matrix, of shape (N, C*Hin*Win).
    *  - Y: Target matrix, of shape (N, K).
    *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
    *  - Y_val: Target validation matrix, of shape (N, K).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - lr: Learning rate.
    *  - mu: Momentum value.
    *      Typical values are in the range of [0.5, 0.99], usually
    *      started at the lower end and annealed towards the higher end.
    *  - decay: Learning rate decay rate.
    *  - lambda: Regularization strength.
    *  - batch_size: Size of mini-batches to train on.
    *  - epochs: Total number of full training loops over the full data set.
    *  - log_interval: Interval, in iterations, between log outputs.
    *  - checkpoint_dir: Directory to store model checkpoints.
    *
    * Outputs:
    *  - Wc1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
    *  - bc1: 1st layer biases vector, of shape (F1, 1).
    *  - Wc2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
    *  - bc2: 2nd layer biases vector, of shape (F2, 1).
    *  - Wc3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
    *  - bc3: 3rd layer biases vector, of shape (1, N3).
    *  - Wa2: 4th layer weights (parameters) matrix, of shape (N3, K).
    *  - ba2: 4th layer biases vector, of shape (1, K).
    */
   N = nrow(X)
   K = ncol(Y)

   # Create network:
   # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
   #  -> affine1 -> relu1 -> dropout1 -> affine2 -> softmax
   Hf = 3  # filter height
   Wf = 3  # filter width
   stride = 1
   pad = 1  # For same dimensions, (Hf - stride) / 2
   F1 = 32  # num conv filters in conv1
   F2 = 32  # num conv filters in conv2
   F3 = 32  # num conv filters in conv3
   N1 = 512  # num nodes in affine1
   # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
   [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
   [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
   [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
   [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1)  # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
   [Wa2, ba2] = affine::init(N1, K)  # inputs: (N, N1)
   Wa2 = Wa2 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu

   # TODO: Compare optimizers once training is faster.
   # Initialize SGD w/ Nesterov momentum optimizer
   vWc1 = sgd_nesterov::init(Wc1); vbc1 = sgd_nesterov::init(bc1)
   vWc2 = sgd_nesterov::init(Wc2); vbc2 = sgd_nesterov::init(bc2)
   vWc3 = sgd_nesterov::init(Wc3); vbc3 = sgd_nesterov::init(bc3)
   vWa1 = sgd_nesterov::init(Wa1); vba1 = sgd_nesterov::init(ba1)
   vWa2 = sgd_nesterov::init(Wa2); vba2 = sgd_nesterov::init(ba2)
   #[mWc1, vWc1] = adam::init(Wc1)  # optimizer 1st & 2nd moment state for Wc1
   #[mbc1, vbc1] = adam::init(bc1)  # optimizer 1st & 2nd moment state for bc1
   #[mWc2, vWc2] = adam::init(Wc2)  # optimizer 1st & 2nd moment state for Wc2
   #[mbc2, vbc2] = adam::init(bc2)  # optimizer 1st & 2nd moment state for bc2
   #[mWc3, vWc3] = adam::init(Wc3)  # optimizer 1st & 2nd moment state for Wc3
   #[mbc3, vbc3] = adam::init(bc3)  # optimizer 1st & 2nd moment state for bc3
   #[mWa1, vWa1] = adam::init(Wa1)  # optimizer 1st & 2nd moment state for Wa1
   #[mba1, vba1] = adam::init(ba1)  # optimizer 1st & 2nd moment state for ba1
   #[mWa2, vWa2] = adam::init(Wa2)  # optimizer 1st & 2nd moment state for Wa2
   #[mba2, vba2] = adam::init(ba2)  # optimizer 1st & 2nd moment state for ba2
   #beta1 = 0.9
   #beta2 = 0.999
   #eps = 1e-8

   # TODO: Enable starting val metrics once fast, distributed predictions are available.
   # Starting validation loss & accuracy
   #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
   #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
   #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
   ## Output results
   #print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)

   # Optimize
   print("Starting optimization")
   iters = ceil(N / batch_size)
   for (e in 1:epochs) {
     for(i in 1:iters) {
       # Get next batch
       beg = ((i-1) * batch_size) %% N + 1
       end = min(N, beg + batch_size - 1)
       X_batch = X[beg:end,]
       y_batch = Y[beg:end,]

       # Compute forward pass
       ## conv layer 1: conv1 -> relu1 -> pool1
       [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
                                                 stride, stride, pad, pad)
       outc1r = relu::forward(outc1)
       [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
                                                        strideh=2, stridew=2, 0, 0)
       ## conv layer 2: conv2 -> relu2 -> pool2
       [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
                                                 stride, stride, pad, pad)
       outc2r = relu::forward(outc2)
       [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
                                                        strideh=2, stridew=2, 0, 0)
       ## conv layer 3: conv3 -> relu3 -> pool3
       [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
                                                 stride, stride, pad, pad)
       outc3r = relu::forward(outc3)
       [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
                                                        strideh=2, stridew=2, 0, 0)
       ## affine layer 1:  affine1 -> relu1 -> dropout1
       outa1 = affine::forward(outc3p, Wa1, ba1)
       outa1r = relu::forward(outa1)
       [outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
       ## affine layer 2:  affine2 -> softmax
       outa2 = affine::forward(outa1d, Wa2, ba2)
       probs = softmax::forward(outa2)

       # Compute data backward pass
       ## loss:
       dprobs = cross_entropy_loss::backward(probs, y_batch)
       ## affine layer 2:  affine2 -> softmax
       douta2 = softmax::backward(dprobs, outa2)
       [douta1d, dWa2, dba2] = affine::backward(douta2, outa1d, Wa2, ba2)
       ## layer 3:  affine3 -> relu3 -> dropout
       ## affine layer 1:  affine1 -> relu1 -> dropout
       douta1r = dropout::backward(douta1d, outa1r, 0.5, maskad1)
       douta1 = relu::backward(douta1r, outa1)
       [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
       ## conv layer 3: conv3 -> relu3 -> pool3
       doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
                                      Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
       doutc3 = relu::backward(doutc3r, outc3)
       [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc2, F2,
                                                Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 2: conv2 -> relu2 -> pool2
       doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
                                      Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
       doutc2 = relu::backward(doutc2r, outc2)
       [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
                                                Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 1: conv1 -> relu1 -> pool1
       doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
                                      Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
       doutc1 = relu::backward(doutc1r, outc1)
       [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
                                                 Hin, Win, Hf, Wf, stride, stride, pad, pad)

       # Compute regularization backward pass
       dWc1_reg = l2_reg::backward(Wc1, lambda)
       dWc2_reg = l2_reg::backward(Wc2, lambda)
       dWc3_reg = l2_reg::backward(Wc3, lambda)
       dWa1_reg = l2_reg::backward(Wa1, lambda)
       dWa2_reg = l2_reg::backward(Wa2, lambda)
       dWc1 = dWc1 + dWc1_reg
       dWc2 = dWc2 + dWc2_reg
       dWc3 = dWc3 + dWc3_reg
       dWa1 = dWa1 + dWa1_reg
       dWa2 = dWa2 + dWa2_reg

       # Optimize with SGD w/ Nesterov momentum
       [Wc1, vWc1] = sgd_nesterov::update(Wc1, dWc1, lr, mu, vWc1)
       [bc1, vbc1] = sgd_nesterov::update(bc1, dbc1, lr, mu, vbc1)
       [Wc2, vWc2] = sgd_nesterov::update(Wc2, dWc2, lr, mu, vWc2)
       [bc2, vbc2] = sgd_nesterov::update(bc2, dbc2, lr, mu, vbc2)
       [Wc3, vWc3] = sgd_nesterov::update(Wc3, dWc3, lr, mu, vWc3)
       [bc3, vbc3] = sgd_nesterov::update(bc3, dbc3, lr, mu, vbc3)
       [Wa1, vWa1] = sgd_nesterov::update(Wa1, dWa1, lr, mu, vWa1)
       [ba1, vba1] = sgd_nesterov::update(ba1, dba1, lr, mu, vba1)
       [Wa2, vWa2] = sgd_nesterov::update(Wa2, dWa2, lr, mu, vWa2)
       [ba2, vba2] = sgd_nesterov::update(ba2, dba2, lr, mu, vba2)
       #t = e*i - 1
       #[Wc1, mWc1, vWc1] = adam::update(Wc1, dWc1, lr, beta1, beta2, eps, t, mWc1, vWc1)
       #[bc1, mbc1, vbc1] = adam::update(bc1, dbc1, lr, beta1, beta2, eps, t, mbc1, vbc1)
       #[Wc2, mWc2, vWc2] = adam::update(Wc2, dWc2, lr, beta1, beta2, eps, t, mWc2, vWc2)
       #[bc2, mbc2, vbc2] = adam::update(bc2, dbc2, lr, beta1, beta2, eps, t, mbc2, vbc2)
       #[Wc3, mWc3, vWc3] = adam::update(Wc3, dWc3, lr, beta1, beta2, eps, t, mWc3, vWc3)
       #[bc3, mbc3, vbc3] = adam::update(bc3, dbc3, lr, beta1, beta2, eps, t, mbc3, vbc3)
       #[Wa1, mWa1, vWa1] = adam::update(Wa1, dWa1, lr, beta1, beta2, eps, t, mWa1, vWa1)
       #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
       #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
       #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)

       # Compute loss & accuracy for training & validation data every `log_interval` iterations.
       if (i %% log_interval == 0) {
         # Compute training loss & accuracy
         loss_data = cross_entropy_loss::forward(probs, y_batch)
         loss_reg_Wc1 = l2_reg::forward(Wc1, lambda)
         loss_reg_Wc2 = l2_reg::forward(Wc2, lambda)
         loss_reg_Wc3 = l2_reg::forward(Wc3, lambda)
         loss_reg_Wa1 = l2_reg::forward(Wa1, lambda)
         loss_reg_Wa2 = l2_reg::forward(Wa2, lambda)
         loss = loss_data + loss_reg_Wc1 + loss_reg_Wc2 + loss_reg_Wc3 + loss_reg_Wa1 + loss_reg_Wa2
         accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))

         # TODO: Consider enabling val metrics here once fast, distributed predictions are available.
         ## Compute validation loss & accuracy
         #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
         #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
         #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))

         ## Output results
         #print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
         #      + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val
         #      + ", lr: " + lr + ", mu " + mu)
         # Output results
         print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
               + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy)
       }
     }

     # Compute validation loss & accuracy for validation data every epoch
     probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
     loss_val = cross_entropy_loss::forward(probs_val, Y_val)
     accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))

     # Output results
     print("Epoch: " + e + "/" + epochs + ", Val Loss: " + loss_val
           + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu " + mu)

     # Checkpoint model
     dir = checkpoint_dir + e + "/"
     dummy = checkpoint(dir, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
     str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda
           + ", batch_size: " + batch_size
     name = dir + accuracy_val
     write(str, name)

     # Anneal momentum towards 0.999
     mu = mu + (0.999 - mu)/(1+epochs-e)
     # Decay learning rate
     lr = lr * decay
   }
 }

 checkpoint = function(string dir,
                       matrix[double] Wc1, matrix[double] bc1,
                       matrix[double] Wc2, matrix[double] bc2,
                       matrix[double] Wc3, matrix[double] bc3,
                       matrix[double] Wa1, matrix[double] ba1,
                       matrix[double] Wa2, matrix[double] ba2) {
   /*
    * Save the model parameters.
    *
    * Inputs:
    *  - dir: Directory in which to save model parameters.
    *  - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
    *  - bc1: 1st conv layer biases vector, of shape (F1, 1).
    *  - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
    *  - bc2: 2nd conv layer biases vector, of shape (F2, 1).
    *  - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
    *  - bc3: 3rd conv layer biases vector, of shape (F3, 1).
    *  - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^1), N1).
    *  - ba1: 1st affine layer biases vector, of shape (1, N1).
    *  - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
    *  - ba2: 2nd affine layer biases vector, of shape (1, K).
    *
    * Outputs:
    *  - probs: Class probabilities, of shape (N, K).
    */
   write(Wc1, dir + "Wc1", format="binary")
   write(bc1, dir + "bc1", format="binary")
   write(Wc2, dir + "Wc2", format="binary")
   write(bc2, dir + "bc2", format="binary")
   write(Wc3, dir + "Wc3", format="binary")
   write(bc3, dir + "bc3", format="binary")
   write(Wa1, dir + "Wa1", format="binary")
   write(ba1, dir + "ba1", format="binary")
   write(Wa2, dir + "Wa2", format="binary")
   write(ba2, dir + "ba2", format="binary")
 }

 predict = function(matrix[double] X, int C, int Hin, int Win,
                    matrix[double] Wc1, matrix[double] bc1,
                    matrix[double] Wc2, matrix[double] bc2,
                    matrix[double] Wc3, matrix[double] bc3,
                    matrix[double] Wa1, matrix[double] ba1,
                    matrix[double] Wa2, matrix[double] ba2)
     return (matrix[double] probs) {
   /*
    * Computes the class probability predictions of a convolutional
    * net using the "LeNet" architecture.
    *
    * The input matrix, X, has N examples, each represented as a 3D
    * volume unrolled into a single vector.
    *
    * Inputs:
    *  - X: Input data matrix, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
    *  - bc1: 1st conv layer biases vector, of shape (F1, 1).
    *  - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
    *  - bc2: 2nd conv layer biases vector, of shape (F2, 1).
    *  - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
    *  - bc3: 3rd conv layer biases vector, of shape (F3, 1).
    *  - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^1), N1).
    *  - ba1: 1st affine layer biases vector, of shape (1, N1).
    *  - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
    *  - ba2: 2nd affine layer biases vector, of shape (1, K).
    *
    * Outputs:
    *  - probs: Class probabilities, of shape (N, K).
    */
   N = nrow(X)

   # Network:
   # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
   #  -> affine1 -> relu1 -> affine2 -> softmax
   Hf = 3  # filter height
   Wf = 3  # filter width
   stride = 1
   pad = 1  # For same dimensions, (Hf - stride) / 2

   F1 = nrow(Wc1)  # num conv filters in conv1
   F2 = nrow(Wc2)  # num conv filters in conv2
   F3 = nrow(Wc3)  # num conv filters in conv3
   N1 = ncol(Wa1)  # num nodes in affine1
   K = ncol(Wa2)  # num nodes in affine2, equal to number of target dimensions (num classes)

   # TODO: Implement fast, distributed conv & max pooling operators so that predictions
   # can be computed in a full-batch, distributed manner.  Alternatively, improve `parfor`
   # so that it can be efficiently used for parallel predictions.
   ## Compute forward pass
   ### conv layer 1: conv1 -> relu1 -> pool1
   #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
   #                                          pad, pad)
   #outc1r = relu::forward(outc1)
   #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
   #                                                 strideh=2, stridew=2, 0, 0)
   ### conv layer 2: conv2 -> relu2 -> pool2
   #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
   #                                          stride, stride, pad, pad)
   #outc2r = relu::forward(outc2)
   #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
   #                                                 strideh=2, stridew=2, 0, 0)
   ### conv layer 3: conv3 -> relu3 -> pool3
   #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
   #                                          stride, stride, pad, pad)
   #outc3r = relu::forward(outc3)
   #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
   #                                                 strideh=2, stridew=2, 0, 0)
   ### affine layer 1:  affine1 -> relu1 -> dropout
   #outa1 = affine::forward(outc3p, Wa1, ba1)
   #outa1r = relu::forward(outa1)
   ##[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
   ### affine layer 2:  affine2 -> softmax
   #outa2 = affine::forward(outa1r, Wa2, ba2)
   #probs = softmax::forward(outa2)

   # Compute predictions over mini-batches
   probs = matrix(0, rows=N, cols=K)
   batch_size = 50
   iters = ceil(N / batch_size)
   for(i in 1:iters) {
   # TODO: `parfor` should work here, possibly as an alternative to distributed predictions.
   #parfor(i in 1:iters, check=0, mode=REMOTE_SPARK, resultmerge=REMOTE_SPARK) {
     # Get next batch
     beg = ((i-1) * batch_size) %% N + 1
     end = min(N, beg + batch_size - 1)
     X_batch = X[beg:end,]

     # Compute forward pass
     ## conv layer 1: conv1 -> relu1 -> pool1
     [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
                                               stride, stride, pad, pad)
     outc1r = relu::forward(outc1)
     [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
                                                      strideh=2, stridew=2, 0, 0)
     ## conv layer 2: conv2 -> relu2 -> pool2
     [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
                                               stride, stride, pad, pad)
     outc2r = relu::forward(outc2)
     [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
                                                      strideh=2, stridew=2, 0, 0)
     ## conv layer 3: conv3 -> relu3 -> pool3
     [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
                                               stride, stride, pad, pad)
     outc3r = relu::forward(outc3)
     [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
                                                      strideh=2, stridew=2, 0, 0)
     ## affine layer 1:  affine1 -> relu1 -> dropout
     outa1 = affine::forward(outc3p, Wa1, ba1)
     outa1r = relu::forward(outa1)
     #[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
     ## affine layer 2:  affine2 -> softmax
     outa2 = affine::forward(outa1r, Wa2, ba2)
     probs_batch = softmax::forward(outa2)

     # Store predictions
     probs[beg:end,] = probs_batch
   }
 }

 eval = function(matrix[double] probs, matrix[double] Y)
     return (double loss, double accuracy) {
   /*
    * Evaluates a convolutional net using the "LeNet" architecture.
    *
    * The probs matrix contains the class probability predictions
    * of K classes over N examples.  The targets, Y, have K classes,
    * and are one-hot encoded.
    *
    * Inputs:
    *  - probs: Class probabilities, of shape (N, K).
    *  - Y: Target matrix, of shape (N,
    *
    * Outputs:
    *  - loss: Scalar loss, of shape (1).
    *  - accuracy: Scalar accuracy, of shape (1).
    */
   # Compute loss & accuracy
   loss = cross_entropy_loss::forward(probs, Y)
   correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
   accuracy = mean(correct_pred)
 }

 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
    * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D).
    *  - Y: Target matrix, of shape (N, K).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    */
   # Generate dummy input data
   N = 1024  # num examples
   C = 3  # num input channels
   Hin = 256  # input height
   Win = 256  # input width
   K = 3  # num target classes
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
   classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
   Y = table(seq(1, N), classes)  # one-hot encoding
 }