src/test/scripts/functions/federated/paramserv/CNN.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 /*
  * This file implements all needed functions to evaluate a convolutional neural network of the "LeNet" architecture
  * on different execution schemes and with different inputs, for example a federated input matrix.
  */

 # Imports
 source("scripts/nn/layers/affine.dml") as affine
 source("scripts/nn/layers/conv2d_builtin.dml") as conv2d
 source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("scripts/nn/layers/dropout.dml") as dropout
 source("scripts/nn/layers/l2_reg.dml") as l2_reg
 source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("scripts/nn/layers/relu.dml") as relu
 source("scripts/nn/layers/softmax.dml") as softmax
 source("scripts/nn/optim/sgd_nesterov.dml") as sgd_nesterov

 /*
  * Trains a convolutional net using the "LeNet" architecture, single-threaded, the conventional way.
  *
  * The input matrix, X, has N examples, each represented as a 3D
  * volume unrolled into a single vector.  The targets, Y, have K
  * classes, and are one-hot encoded.
  *
  * Inputs:
  *  - X: Input data matrix, of shape (N, C*Hin*Win)
  *  - y: Target matrix, of shape (N, K)
  *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
  *  - y_val: Target validation matrix, of shape (N, K)
  *  - C: Number of input channels (dimensionality of input depth)
  *  - Hin: Input height
  *  - Win: Input width
  *  - epochs: Total number of full training loops over the full data set
  *  - batch_size: Batch size
  *  - eta: The learning rate for the SGD
  *  - seed: Seed forwarded to the layer initializers (default -1)
  *
  * Outputs:
  *  - model_trained: List containing
  *       - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
  *       - b1: 1st layer biases vector, of shape (F1, 1)
  *       - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
  *       - b2: 2nd layer biases vector, of shape (F2, 1)
  *       - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
  *       - b3: 3rd layer biases vector, of shape (1, N3)
  *       - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
  *       - b4: 4th layer biases vector, of shape (1, K)
  */
 train = function(matrix[double] X, matrix[double] y,
                  matrix[double] X_val, matrix[double] y_val,
                  int epochs, int batch_size, double eta,
                  int C, int Hin, int Win,
                  int seed = -1)
     return (list[unknown] model) {
   # Sequential training driver: each mini-batch is run through gradients() and the
   # resulting gradient list is applied to the model right away via aggregation().
   # X_val and y_val are accepted by the signature but are not read in this function.

   num_examples = nrow(X)
   K = ncol(y)  # number of target columns (classes)

   # Fixed network layout:
   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
   Hf = 5; Wf = 5  # filter height / width
   stride = 1
   pad = 2  # For same dimensions, (Hf - stride) / 2
   F1 = 32  # filters in conv1
   F2 = 64  # filters in conv2
   N3 = 512  # nodes in affine3 (affine4 has K nodes)

   # Optimizer / regularization constants
   mu = 0.9  # momentum
   decay = 0.95  # learning rate decay constant
   lambda = 5e-04  # L2 regularization strength

   # Each layer gets its own seed offset; a seed of -1 is passed through unchanged.
   seed_conv2 = ifelse(seed==-1, -1, seed + 1);
   seed_aff3 = ifelse(seed==-1, -1, seed + 2);
   seed_aff4 = ifelse(seed==-1, -1, seed + 3);

   # Layer parameters
   [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)  # inputs: (N, C*Hin*Win)
   [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = seed_conv2)  # inputs: (N, F1*(Hin/2)*(Win/2))
   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = seed_aff3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
   [W4, b4] = affine::init(N3, K, seed = seed_aff4)  # inputs: (N, N3)
   W4 = W4 / sqrt(2)  # rescaled because affine4 feeds softmax rather than relu

   # Nesterov momentum state, one entry per parameter
   vW1 = sgd_nesterov::init(W1)
   vW2 = sgd_nesterov::init(W2)
   vW3 = sgd_nesterov::init(W3)
   vW4 = sgd_nesterov::init(W4)
   vb1 = sgd_nesterov::init(b1)
   vb2 = sgd_nesterov::init(b2)
   vb3 = sgd_nesterov::init(b3)
   vb4 = sgd_nesterov::init(b4)

   # Hyper parameters are read by name inside gradients() and aggregation()
   hyperparams = list(learning_rate=eta, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3)

   # Model layout: weights (1-4), biases (5-8), weight velocities (9-12), bias velocities (13-16)
   model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)

   num_batches = ceil(num_examples / batch_size)

   for (ep in 1:epochs) {
     for (j in 1:num_batches) {
       # Row range of the j-th mini-batch (the last batch may be shorter)
       first = ((j-1) * batch_size) %% num_examples + 1
       last = min(num_examples, first + batch_size - 1)
       X_part = X[first:last,]
       y_part = y[first:last,]

       grads = gradients(model, hyperparams, X_part, y_part)
       model = aggregation(model, hyperparams, grads)
     }
   }
 }

 /*
  * Trains a convolutional net using the "LeNet" architecture using a parameter server with specified properties.
  *
  * The input matrix, X, has N examples, each represented as a 3D
  * volume unrolled into a single vector.  The targets, Y, have K
  * classes, and are one-hot encoded.
  *
  * Inputs:
  *  - X: Input data matrix, of shape (N, C*Hin*Win)
  *  - Y: Target matrix, of shape (N, K)
  *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
  *  - Y_val: Target validation matrix, of shape (N, K)
  *  - C: Number of input channels (dimensionality of input depth)
  *  - Hin: Input height
  *  - Win: Input width
  *  - epochs: Total number of full training loops over the full data set
  *  - batch_size: Batch size
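  *  - eta: The learning rate for the SGD
  *  - num_workers: Number of workers to create (passed to paramserv as k)
  *  - utype: Update type forwarded to paramserv
  *  - freq: Update frequency forwarded to paramserv
  *  - scheme: Data partitioning scheme forwarded to paramserv
  *  - runtime_balancing: Runtime balancing setting forwarded to paramserv
  *  - seed: Seed forwarded to the layer initializers (default -1)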
  *
  * Outputs:
  *  - model_trained: List containing
  *       - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
  *       - b1: 1st layer biases vector, of shape (F1, 1)
  *       - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
  *       - b2: 2nd layer biases vector, of shape (F2, 1)
  *       - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
  *       - b3: 3rd layer biases vector, of shape (1, N3)
  *       - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
  *       - b4: 4th layer biases vector, of shape (1, K)
  */
 train_paramserv = function(matrix[double] X, matrix[double] y,
                  matrix[double] X_val, matrix[double] y_val,
                  int num_workers, int epochs, string utype, string freq, int batch_size, string scheme, string runtime_balancing,
                  double eta, int C, int Hin, int Win,
                  int seed = -1)
     return (list[unknown] model) {
   # Builds the initial LeNet model and hyper parameter list, then hands training
   # over to the paramserv builtin, which calls gradients() as the update function
   # and aggregation() as the aggregation function of this script.

   K = ncol(y)  # number of target columns (classes)

   # Create network:
   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
   Hf = 5  # filter height
   Wf = 5  # filter width
   stride = 1
   pad = 2  # For same dimensions, (Hf - stride) / 2
   F1 = 32  # num conv filters in conv1
   F2 = 64  # num conv filters in conv2
   N3 = 512  # num nodes in affine3
   # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)

   # Every layer uses a distinct seed offset; a seed of -1 is passed through unchanged.
   [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)  # inputs: (N, C*Hin*Win)
   lseed = ifelse(seed==-1, -1, seed + 1);
   [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = lseed)  # inputs: (N, F1*(Hin/2)*(Win/2))
   lseed = ifelse(seed==-1, -1, seed + 2);
   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = lseed)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
   lseed = ifelse(seed==-1, -1, seed + 3);
   [W4, b4] = affine::init(N3, K, seed = lseed)  # inputs: (N, N3)
   W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu

   # Initialize SGD w/ Nesterov momentum optimizer
   # (the learning rate itself travels inside hyperparams as "learning_rate")
   mu = 0.9  # momentum
   decay = 0.95  # learning rate decay constant
   vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
   vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
   vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
   vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
   # Regularization
   lambda = 5e-04
   # Model layout: weights (1-4), biases (5-8), weight velocities (9-12), bias velocities (13-16)
   model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)
   # Hyper parameters are looked up by name in gradients() and aggregation()
   hyperparams = list(learning_rate=eta, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3)

   # Use paramserv function
   model = paramserv(model=model, features=X, labels=y, val_features=X_val, val_labels=y_val,
     upd="./src/test/scripts/functions/federated/paramserv/CNN.dml::gradients",
     agg="./src/test/scripts/functions/federated/paramserv/CNN.dml::aggregation",
     k=num_workers, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size,
     scheme=scheme, runtime_balancing=runtime_balancing, hyperparams=hyperparams)
 }

 /*
  * Computes the class probability predictions of a convolutional
  * net using the "LeNet" architecture.
  *
  * The input matrix, X, has N examples, each represented as a 3D
  * volume unrolled into a single vector.
  *
  * Inputs:
  *  - X: Input data matrix, of shape (N, C*Hin*Win)
  *  - C: Number of input channels (dimensionality of input depth)
  *  - Hin: Input height
  *  - Win: Input width
  *  - batch_size: Batch size
  *  - model: List containing
  *       - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
  *       - b1: 1st layer biases vector, of shape (F1, 1)
  *       - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
  *       - b2: 2nd layer biases vector, of shape (F2, 1)
  *       - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
  *       - b3: 3rd layer biases vector, of shape (1, N3)
  *       - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
  *       - b4: 4th layer biases vector, of shape (1, K)
  *
  * Outputs:
  *  - probs: Class probabilities, of shape (N, K)
  */
 predict = function(matrix[double] X, int C, int Hin, int Win, int batch_size, list[unknown] model)
     return (matrix[double] probs) {
   # Inference-only forward pass (no dropout), evaluated mini-batch by mini-batch
   # inside a parfor; each iteration writes its own row range of probs.

   # Unpack the parameters: weights sit at positions 1-4, biases at 5-8
   W1 = as.matrix(model[1]); b1 = as.matrix(model[5])
   W2 = as.matrix(model[2]); b2 = as.matrix(model[6])
   W3 = as.matrix(model[3]); b3 = as.matrix(model[7])
   W4 = as.matrix(model[4]); b4 = as.matrix(model[8])
   num_rows = nrow(X)

   # Network:
   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
   Hf = 5; Wf = 5  # filter height / width
   stride = 1
   pad = 2  # For same dimensions, (Hf - stride) / 2
   # Layer sizes are recovered from the parameter shapes
   F1 = nrow(W1)  # filters in conv1
   F2 = nrow(W2)  # filters in conv2
   N3 = ncol(W3)  # nodes in affine3
   K = ncol(W4)  # nodes in affine4 = number of classes

   # Result buffer, filled one batch at a time
   probs = matrix(0, rows=num_rows, cols=K)
   num_batches = ceil(num_rows / batch_size)
   parfor(i in 1:num_batches, check=0) {
     # Row range of the i-th batch (the last batch may be shorter)
     first = ((i-1) * batch_size) %% num_rows + 1
     last = min(num_rows, first + batch_size - 1)
     X_part = X[first:last,]

     ## block 1: conv1 -> relu1 -> pool1
     [c1, Hc1, Wc1] = conv2d::forward(X_part, W1, b1, C, Hin, Win, Hf, Wf,
                                      stride, stride, pad, pad)
     r1 = relu::forward(c1)
     [p1, Hp1, Wp1] = max_pool2d::forward(r1, F1, Hc1, Wc1, 2, 2, 2, 2, 0, 0)

     ## block 2: conv2 -> relu2 -> pool2
     [c2, Hc2, Wc2] = conv2d::forward(p1, W2, b2, F1, Hp1, Wp1, Hf, Wf,
                                      stride, stride, pad, pad)
     r2 = relu::forward(c2)
     [p2, Hp2, Wp2] = max_pool2d::forward(r2, F2, Hc2, Wc2, 2, 2, 2, 2, 0, 0)

     ## block 3: affine3 -> relu3
     a3 = affine::forward(p2, W3, b3)
     r3 = relu::forward(a3)

     ## block 4: affine4 -> softmax
     a4 = affine::forward(r3, W4, b4)
     batch_probs = softmax::forward(a4)

     # Write this batch's predictions into its rows of the result
     probs[first:last,] = batch_probs
   }
 }

 /*
  * Evaluates a convolutional net using the "LeNet" architecture.
  *
  * The probs matrix contains the class probability predictions
  * of K classes over N examples.  The targets, y, have K classes,
  * and are one-hot encoded.
  *
  * Inputs:
  *  - probs: Class probabilities, of shape (N, K)
  *  - y: Target matrix, of shape (N, K)
  *
  * Outputs:
  *  - loss: Scalar loss, of shape (1)
  *  - accuracy: Scalar accuracy, of shape (1)
  */
 eval = function(matrix[double] probs, matrix[double] y)
     return (double loss, double accuracy) {
   # Accuracy: share of rows whose highest-scoring column in probs is the same
   # column that holds the row maximum in y.
   predicted_class = rowIndexMax(probs)
   true_class = rowIndexMax(y)
   accuracy = mean(predicted_class == true_class)

   # Loss: delegated to the cross-entropy loss layer.
   loss = cross_entropy_loss::forward(probs, y)
 }

 # Should always use 'features' (batch features), 'labels' (batch labels),
 # 'hyperparams', 'model' as the arguments
 # and return the gradients of type list
 gradients = function(list[unknown] model,
                      list[unknown] hyperparams,
                      matrix[double] features,
                      matrix[double] labels)
           return (list[unknown] gradients) {
   # Runs one forward and backward pass of the LeNet network on a single batch
   # and returns the parameter gradients as list(dW1, dW2, dW3, dW4, db1, db2, db3, db4).
   # The L2 regularization gradient is added to the weight gradients only.

   # Hyper parameters are looked up by name
   C = as.integer(as.scalar(hyperparams["C"]))
   Hin = as.integer(as.scalar(hyperparams["Hin"]))
   Win = as.integer(as.scalar(hyperparams["Win"]))
   Hf = as.integer(as.scalar(hyperparams["Hf"]))
   Wf = as.integer(as.scalar(hyperparams["Wf"]))
   stride = as.integer(as.scalar(hyperparams["stride"]))
   pad = as.integer(as.scalar(hyperparams["pad"]))
   lambda = as.double(as.scalar(hyperparams["lambda"]))
   F1 = as.integer(as.scalar(hyperparams["F1"]))
   F2 = as.integer(as.scalar(hyperparams["F2"]))
   N3 = as.integer(as.scalar(hyperparams["N3"]))
   # Model entries are looked up by position: weights at 1-4, biases at 5-8
   W1 = as.matrix(model[1])
   W2 = as.matrix(model[2])
   W3 = as.matrix(model[3])
   W4 = as.matrix(model[4])
   b1 = as.matrix(model[5])
   b2 = as.matrix(model[6])
   b3 = as.matrix(model[7])
   b4 = as.matrix(model[8])

   # Compute forward pass
   ## layer 1: conv1 -> relu1 -> pool1
   [outc1, Houtc1, Woutc1] = conv2d::forward(features, W1, b1, C, Hin, Win, Hf, Wf,
                                               stride, stride, pad, pad)
   outr1 = relu::forward(outc1)
   [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
   ## layer 2: conv2 -> relu2 -> pool2
   [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
                                             stride, stride, pad, pad)
   outr2 = relu::forward(outc2)
   [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
   ## layer 3:  affine3 -> relu3 -> dropout
   outa3 = affine::forward(outp2, W3, b3)
   outr3 = relu::forward(outa3)
   [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
   ## layer 4:  affine4 -> softmax
   outa4 = affine::forward(outd3, W4, b4)
   probs = softmax::forward(outa4)

   # Compute loss & accuracy for training data
   loss = cross_entropy_loss::forward(probs, labels)
   accuracy = mean(rowIndexMax(probs) == rowIndexMax(labels))
   print("[+] Completed forward pass on batch: train loss: " + loss + ", train accuracy: " + accuracy)

   # Compute data backward pass
   ## loss
   dprobs = cross_entropy_loss::backward(probs, labels)
   ## layer 4:  affine4 -> softmax
   douta4 = softmax::backward(dprobs, outa4)
   # affine4 consumed the dropout output (outd3) in the forward pass, so its
   # backward pass must be given outd3 as well; passing outr3 would compute
   # dW4 against activations that never entered the layer.
   [doutd3, dW4, db4] = affine::backward(douta4, outd3, W4, b4)
   ## layer 3:  affine3 -> relu3 -> dropout
   doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
   douta3 = relu::backward(doutr3, outa3)
   [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
   ## layer 2: conv2 -> relu2 -> pool2
   doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
   doutc2 = relu::backward(doutr2, outc2)
   [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
                                         Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
   ## layer 1: conv1 -> relu1 -> pool1
   doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
   doutc1 = relu::backward(doutr1, outc1)
   [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, features, W1, b1, C, Hin, Win,
                                           Hf, Wf, stride, stride, pad, pad)

   # Compute regularization backward pass
   dW1_reg = l2_reg::backward(W1, lambda)
   dW2_reg = l2_reg::backward(W2, lambda)
   dW3_reg = l2_reg::backward(W3, lambda)
   dW4_reg = l2_reg::backward(W4, lambda)
   dW1 = dW1 + dW1_reg
   dW2 = dW2 + dW2_reg
   dW3 = dW3 + dW3_reg
   dW4 = dW4 + dW4_reg

   # Same positional layout as the first eight entries of the model list
   gradients = list(dW1, dW2, dW3, dW4, db1, db2, db3, db4)
 }

 # Should use the arguments named 'model', 'gradients', 'hyperparams'
 # and return always a model of type list
 aggregation = function(list[unknown] model,
                        list[unknown] hyperparams,
                        list[unknown] gradients)
     return (list[unknown] model_result) {
    # Applies one SGD-with-Nesterov-momentum step to every parameter.
    # Positional layout of model: weights 1-4, biases 5-8, weight velocities 9-12,
    # bias velocities 13-16. Layout of gradients: weight grads 1-4, bias grads 5-8.

    # Optimizer settings, looked up by name
    lr = as.double(as.scalar(hyperparams["learning_rate"]))
    momentum = as.double(as.scalar(hyperparams["mu"]))

    # Layer 1
    W1 = as.matrix(model[1]);  dW1 = as.matrix(gradients[1]);  vW1 = as.matrix(model[9])
    b1 = as.matrix(model[5]);  db1 = as.matrix(gradients[5]);  vb1 = as.matrix(model[13])
    [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, momentum, vW1)
    [b1, vb1] = sgd_nesterov::update(b1, db1, lr, momentum, vb1)

    # Layer 2
    W2 = as.matrix(model[2]);  dW2 = as.matrix(gradients[2]);  vW2 = as.matrix(model[10])
    b2 = as.matrix(model[6]);  db2 = as.matrix(gradients[6]);  vb2 = as.matrix(model[14])
    [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, momentum, vW2)
    [b2, vb2] = sgd_nesterov::update(b2, db2, lr, momentum, vb2)

    # Layer 3
    W3 = as.matrix(model[3]);  dW3 = as.matrix(gradients[3]);  vW3 = as.matrix(model[11])
    b3 = as.matrix(model[7]);  db3 = as.matrix(gradients[7]);  vb3 = as.matrix(model[15])
    [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, momentum, vW3)
    [b3, vb3] = sgd_nesterov::update(b3, db3, lr, momentum, vb3)

    # Layer 4
    W4 = as.matrix(model[4]);  dW4 = as.matrix(gradients[4]);  vW4 = as.matrix(model[12])
    b4 = as.matrix(model[8]);  db4 = as.matrix(gradients[8]);  vb4 = as.matrix(model[16])
    [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, momentum, vW4)
    [b4, vb4] = sgd_nesterov::update(b4, db4, lr, momentum, vb4)

    # Repack in the same positional layout as the input model
    model_result = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	/*
	* This file implements all needed functions to evaluate a convolutional neural network of the "LeNet" architecture
	* on different execution schemes and with different inputs, for example a federated input matrix.
	*/

	# Imports
	source("scripts/nn/layers/affine.dml") as affine
	source("scripts/nn/layers/conv2d_builtin.dml") as conv2d
	source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
	source("scripts/nn/layers/dropout.dml") as dropout
	source("scripts/nn/layers/l2_reg.dml") as l2_reg
	source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d
	source("scripts/nn/layers/relu.dml") as relu
	source("scripts/nn/layers/softmax.dml") as softmax
	source("scripts/nn/optim/sgd_nesterov.dml") as sgd_nesterov

	/*
	* Trains a convolutional net using the "LeNet" architecture, single-threaded, the conventional way.
	*
	* The input matrix, X, has N examples, each represented as a 3D
	* volume unrolled into a single vector. The targets, Y, have K
	* classes, and are one-hot encoded.
	*
	* Inputs:
	* - X: Input data matrix, of shape (N, C*Hin*Win)
	* - y: Target matrix, of shape (N, K)
	* - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
	* - y_val: Target validation matrix, of shape (N, K)
	* - C: Number of input channels (dimensionality of input depth)
	* - Hin: Input height
	* - Win: Input width
	* - epochs: Total number of full training loops over the full data set
	* - batch_size: Batch size
	* - learning_rate: The learning rate for the SGD
	*
	* Outputs:
	* - model_trained: List containing
	* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
	* - b1: 1st layer biases vector, of shape (F1, 1)
	* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
	* - b2: 2nd layer biases vector, of shape (F2, 1)
	* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
	* - b3: 3rd layer biases vector, of shape (1, N3)
	* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
	* - b4: 4th layer biases vector, of shape (1, K)
	*/
	train = function(matrix[double] X, matrix[double] y,
	                 matrix[double] X_val, matrix[double] y_val,
	                 int epochs, int batch_size, double eta,
	                 int C, int Hin, int Win,
	                 int seed = -1)
	    return (list[unknown] model) {
	  # Sequential training driver: each mini-batch is run through gradients() and the
	  # resulting gradient list is applied to the model right away via aggregation().
	  # X_val and y_val are accepted by the signature but are not read in this function.

	  N = nrow(X)
	  K = ncol(y)

	  # Create network:
	  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
	  Hf = 5  # filter height
	  Wf = 5  # filter width
	  stride = 1
	  pad = 2  # For same dimensions, (Hf - stride) / 2
	  F1 = 32  # num conv filters in conv1
	  F2 = 64  # num conv filters in conv2
	  N3 = 512  # num nodes in affine3
	  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)

	  # Every layer uses a distinct seed offset; a seed of -1 is passed through unchanged.
	  [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)  # inputs: (N, C*Hin*Win)
	  lseed = ifelse(seed==-1, -1, seed + 1);
	  [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = lseed)  # inputs: (N, F1*(Hin/2)*(Win/2))
	  lseed = ifelse(seed==-1, -1, seed + 2);
	  # The multiplication operators are required here: the affine3 input width is
	  # the product of the conv2 filter count and the twice-halved spatial dimensions.
	  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = lseed)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
	  lseed = ifelse(seed==-1, -1, seed + 3);
	  [W4, b4] = affine::init(N3, K, seed = lseed)  # inputs: (N, N3)
	  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu

	  # Initialize SGD w/ Nesterov momentum optimizer
	  mu = 0.9  # momentum
	  decay = 0.95  # learning rate decay constant
	  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
	  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
	  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
	  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)

	  # Model layout: weights (1-4), biases (5-8), weight velocities (9-12), bias velocities (13-16)
	  model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)

	  # Regularization
	  lambda = 5e-04

	  # Create the hyper parameter list
	  hyperparams = list(learning_rate=eta, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3)
	  # Calculate iterations
	  iters = ceil(N / batch_size)

	  for (e in 1:epochs) {
	    for(i in 1:iters) {
	      # Get next batch (the last batch may be shorter)
	      beg = ((i-1) * batch_size) %% N + 1
	      end = min(N, beg + batch_size - 1)
	      X_batch = X[beg:end,]
	      y_batch = y[beg:end,]

	      gradients_list = gradients(model, hyperparams, X_batch, y_batch)
	      model = aggregation(model, hyperparams, gradients_list)
	    }
	  }
	}

	/*
	* Trains a convolutional net using the "LeNet" architecture using a parameter server with specified properties.
	*
	* The input matrix, X, has N examples, each represented as a 3D
	* volume unrolled into a single vector. The targets, Y, have K
	* classes, and are one-hot encoded.
	*
	* Inputs:
	* - X: Input data matrix, of shape (N, C*Hin*Win)
	* - Y: Target matrix, of shape (N, K)
	* - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
	* - Y_val: Target validation matrix, of shape (N, K)
	* - C: Number of input channels (dimensionality of input depth)
	* - Hin: Input height
	* - Win: Input width
	* - epochs: Total number of full training loops over the full data set
	* - batch_size: Batch size
	* - learning_rate: The learning rate for the SGD
	* - workers: Number of workers to create
	* - utype: parameter server framework to use
	* - scheme: update schema
	* - mode: local or distributed
	*
	* Outputs:
	* - model_trained: List containing
	* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
	* - b1: 1st layer biases vector, of shape (F1, 1)
	* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
	* - b2: 2nd layer biases vector, of shape (F2, 1)
	* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
	* - b3: 3rd layer biases vector, of shape (1, N3)
	* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
	* - b4: 4th layer biases vector, of shape (1, K)
	*/
	train_paramserv = function(matrix[double] X, matrix[double] y,
	                 matrix[double] X_val, matrix[double] y_val,
	                 int num_workers, int epochs, string utype, string freq, int batch_size, string scheme, string runtime_balancing,
	                 double eta, int C, int Hin, int Win,
	                 int seed = -1)
	    return (list[unknown] model) {
	  # Builds the initial LeNet model and hyper parameter list, then hands training
	  # over to the paramserv builtin, which calls gradients() as the update function
	  # and aggregation() as the aggregation function of this script.

	  K = ncol(y)  # number of target columns (classes)

	  # Create network:
	  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
	  Hf = 5  # filter height
	  Wf = 5  # filter width
	  stride = 1
	  pad = 2  # For same dimensions, (Hf - stride) / 2
	  F1 = 32  # num conv filters in conv1
	  F2 = 64  # num conv filters in conv2
	  N3 = 512  # num nodes in affine3
	  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)

	  # Every layer uses a distinct seed offset; a seed of -1 is passed through unchanged.
	  [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)  # inputs: (N, C*Hin*Win)
	  lseed = ifelse(seed==-1, -1, seed + 1);
	  [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = lseed)  # inputs: (N, F1*(Hin/2)*(Win/2))
	  lseed = ifelse(seed==-1, -1, seed + 2);
	  # The multiplication operators are required here: the affine3 input width is
	  # the product of the conv2 filter count and the twice-halved spatial dimensions.
	  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = lseed)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
	  lseed = ifelse(seed==-1, -1, seed + 3);
	  [W4, b4] = affine::init(N3, K, seed = lseed)  # inputs: (N, N3)
	  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu

	  # Initialize SGD w/ Nesterov momentum optimizer
	  # (the learning rate itself travels inside hyperparams as "learning_rate")
	  mu = 0.9  # momentum
	  decay = 0.95  # learning rate decay constant
	  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
	  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
	  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
	  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
	  # Regularization
	  lambda = 5e-04
	  # Model layout: weights (1-4), biases (5-8), weight velocities (9-12), bias velocities (13-16)
	  model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)
	  # Hyper parameters are looked up by name in gradients() and aggregation()
	  hyperparams = list(learning_rate=eta, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3)

	  # Use paramserv function
	  model = paramserv(model=model, features=X, labels=y, val_features=X_val, val_labels=y_val,
	    upd="./src/test/scripts/functions/federated/paramserv/CNN.dml::gradients",
	    agg="./src/test/scripts/functions/federated/paramserv/CNN.dml::aggregation",
	    k=num_workers, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size,
	    scheme=scheme, runtime_balancing=runtime_balancing, hyperparams=hyperparams)
	}

	/*
	* Computes the class probability predictions of a convolutional
	* net using the "LeNet" architecture.
	*
	* The input matrix, X, has N examples, each represented as a 3D
	* volume unrolled into a single vector.
	*
	* Inputs:
	* - X: Input data matrix, of shape (N, C*Hin*Win)
	* - C: Number of input channels (dimensionality of input depth)
	* - Hin: Input height
	* - Win: Input width
	* - batch_size: Batch size
	* - model: List containing
	* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
	* - b1: 1st layer biases vector, of shape (F1, 1)
	* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
	* - b2: 2nd layer biases vector, of shape (F2, 1)
	* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
	* - b3: 3rd layer biases vector, of shape (1, N3)
	* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
	* - b4: 4th layer biases vector, of shape (1, K)
	*
	* Outputs:
	* - probs: Class probabilities, of shape (N, K)
	*/
	predict = function(matrix[double] X, int C, int Hin, int Win, int batch_size, list[unknown] model)
	    return (matrix[double] probs) {
	  # Inference-only forward pass (no dropout), evaluated mini-batch by mini-batch
	  # inside a parfor; each iteration writes its own row range of probs.

	  # Unpack the parameters: weights sit at positions 1-4, biases at 5-8
	  W1 = as.matrix(model[1]); b1 = as.matrix(model[5])
	  W2 = as.matrix(model[2]); b2 = as.matrix(model[6])
	  W3 = as.matrix(model[3]); b3 = as.matrix(model[7])
	  W4 = as.matrix(model[4]); b4 = as.matrix(model[8])
	  num_rows = nrow(X)

	  # Network:
	  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
	  Hf = 5; Wf = 5  # filter height / width
	  stride = 1
	  pad = 2  # For same dimensions, (Hf - stride) / 2
	  # Layer sizes are recovered from the parameter shapes
	  F1 = nrow(W1)  # filters in conv1
	  F2 = nrow(W2)  # filters in conv2
	  N3 = ncol(W3)  # nodes in affine3
	  K = ncol(W4)  # nodes in affine4 = number of classes

	  # Result buffer, filled one batch at a time
	  probs = matrix(0, rows=num_rows, cols=K)
	  num_batches = ceil(num_rows / batch_size)
	  parfor(i in 1:num_batches, check=0) {
	    # Row range of the i-th batch (the last batch may be shorter)
	    first = ((i-1) * batch_size) %% num_rows + 1
	    last = min(num_rows, first + batch_size - 1)
	    X_part = X[first:last,]

	    ## block 1: conv1 -> relu1 -> pool1
	    [c1, Hc1, Wc1] = conv2d::forward(X_part, W1, b1, C, Hin, Win, Hf, Wf,
	                                     stride, stride, pad, pad)
	    r1 = relu::forward(c1)
	    [p1, Hp1, Wp1] = max_pool2d::forward(r1, F1, Hc1, Wc1, 2, 2, 2, 2, 0, 0)

	    ## block 2: conv2 -> relu2 -> pool2
	    [c2, Hc2, Wc2] = conv2d::forward(p1, W2, b2, F1, Hp1, Wp1, Hf, Wf,
	                                     stride, stride, pad, pad)
	    r2 = relu::forward(c2)
	    [p2, Hp2, Wp2] = max_pool2d::forward(r2, F2, Hc2, Wc2, 2, 2, 2, 2, 0, 0)

	    ## block 3: affine3 -> relu3
	    a3 = affine::forward(p2, W3, b3)
	    r3 = relu::forward(a3)

	    ## block 4: affine4 -> softmax
	    a4 = affine::forward(r3, W4, b4)
	    batch_probs = softmax::forward(a4)

	    # Write this batch's predictions into its rows of the result
	    probs[first:last,] = batch_probs
	  }
	}

	/*
	* Evaluates a convolutional net using the "LeNet" architecture.
	*
	* The probs matrix contains the class probability predictions
	* of K classes over N examples. The targets, y, have K classes,
	* and are one-hot encoded.
	*
	* Inputs:
	* - probs: Class probabilities, of shape (N, K)
	* - y: Target matrix, of shape (N, K)
	*
	* Outputs:
	* - loss: Scalar loss, of shape (1)
	* - accuracy: Scalar accuracy, of shape (1)
	*/
	eval = function(matrix[double] probs, matrix[double] y)
	    return (double loss, double accuracy) {
	  # Accuracy: fraction of rows in which the column index of the largest
	  # predicted probability equals the column index of the largest target entry.
	  predicted_idx = rowIndexMax(probs)
	  target_idx = rowIndexMax(y)
	  accuracy = mean(predicted_idx == target_idx)

	  # Loss: delegated to the cross-entropy loss layer.
	  loss = cross_entropy_loss::forward(probs, y)
	}

	# Should always take 'features' (batch features), 'labels' (batch labels),
	# 'hyperparams', and 'model' as its arguments,
	# and should return the gradients as a list.
	gradients = function(list[unknown] model,
	                     list[unknown] hyperparams,
	                     matrix[double] features,
	                     matrix[double] labels)
	    return (list[unknown] gradients) {
	  # Computes the gradients of the network parameters for one batch.
	  #
	  # Inputs:
	  #  - model: List holding the weights W1..W4 at positions 1-4 and the
	  #      biases b1..b4 at positions 5-8 (further entries are ignored here).
	  #  - hyperparams: Named list providing "C", "Hin", "Win", "Hf", "Wf",
	  #      "stride", "pad", "lambda", "F1" and "F2".
	  #  - features: Batch of input examples.
	  #  - labels: Batch of targets matching `features` row by row.
	  #
	  # Outputs:
	  #  - gradients: list(dW1, dW2, dW3, dW4, db1, db2, db3, db4); the weight
	  #      gradients include the L2 regularization term.
	  #
	  # Side effect: prints the training loss and accuracy of the batch.

	  # Unpack hyperparameters
	  C = as.integer(as.scalar(hyperparams["C"]))
	  Hin = as.integer(as.scalar(hyperparams["Hin"]))
	  Win = as.integer(as.scalar(hyperparams["Win"]))
	  Hf = as.integer(as.scalar(hyperparams["Hf"]))
	  Wf = as.integer(as.scalar(hyperparams["Wf"]))
	  stride = as.integer(as.scalar(hyperparams["stride"]))
	  pad = as.integer(as.scalar(hyperparams["pad"]))
	  lambda = as.double(as.scalar(hyperparams["lambda"]))
	  F1 = as.integer(as.scalar(hyperparams["F1"]))
	  F2 = as.integer(as.scalar(hyperparams["F2"]))

	  # Unpack model parameters
	  W1 = as.matrix(model[1])
	  W2 = as.matrix(model[2])
	  W3 = as.matrix(model[3])
	  W4 = as.matrix(model[4])
	  b1 = as.matrix(model[5])
	  b2 = as.matrix(model[6])
	  b3 = as.matrix(model[7])
	  b4 = as.matrix(model[8])

	  # Compute forward pass
	  ## layer 1: conv1 -> relu1 -> pool1
	  [outc1, Houtc1, Woutc1] = conv2d::forward(features, W1, b1, C, Hin, Win, Hf, Wf,
	                                            stride, stride, pad, pad)
	  outr1 = relu::forward(outc1)
	  [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
	  ## layer 2: conv2 -> relu2 -> pool2
	  [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
	                                            stride, stride, pad, pad)
	  outr2 = relu::forward(outc2)
	  [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
	  ## layer 3: affine3 -> relu3 -> dropout
	  outa3 = affine::forward(outp2, W3, b3)
	  outr3 = relu::forward(outa3)
	  [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
	  ## layer 4: affine4 -> softmax
	  outa4 = affine::forward(outd3, W4, b4)
	  probs = softmax::forward(outa4)

	  # Compute loss & accuracy for training data
	  loss = cross_entropy_loss::forward(probs, labels)
	  accuracy = mean(rowIndexMax(probs) == rowIndexMax(labels))
	  print("[+] Completed forward pass on batch: train loss: " + loss + ", train accuracy: " + accuracy)

	  # Compute data backward pass
	  ## loss
	  dprobs = cross_entropy_loss::backward(probs, labels)
	  ## layer 4: affine4 -> softmax
	  douta4 = softmax::backward(dprobs, outa4)
	  # affine4 consumed the dropout output (outd3) in the forward pass, so its
	  # backward pass must receive that same input; passing the pre-dropout
	  # activations (outr3) would yield a weight gradient that ignores the mask.
	  [doutd3, dW4, db4] = affine::backward(douta4, outd3, W4, b4)
	  ## layer 3: affine3 -> relu3 -> dropout
	  doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
	  douta3 = relu::backward(doutr3, outa3)
	  [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
	  ## layer 2: conv2 -> relu2 -> pool2
	  doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
	  doutc2 = relu::backward(doutr2, outc2)
	  [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
	                                        Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
	  ## layer 1: conv1 -> relu1 -> pool1
	  doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
	  doutc1 = relu::backward(doutr1, outc1)
	  # dX_batch (gradient w.r.t. the input features) is not used afterwards.
	  [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, features, W1, b1, C, Hin, Win,
	                                          Hf, Wf, stride, stride, pad, pad)

	  # Compute regularization backward pass and add it to the weight gradients
	  dW1_reg = l2_reg::backward(W1, lambda)
	  dW2_reg = l2_reg::backward(W2, lambda)
	  dW3_reg = l2_reg::backward(W3, lambda)
	  dW4_reg = l2_reg::backward(W4, lambda)
	  dW1 = dW1 + dW1_reg
	  dW2 = dW2 + dW2_reg
	  dW3 = dW3 + dW3_reg
	  dW4 = dW4 + dW4_reg

	  gradients = list(dW1, dW2, dW3, dW4, db1, db2, db3, db4)
	}

	# Should use the arguments named 'model', 'gradients', and 'hyperparams',
	# and should always return a model of type list.
	aggregation = function(list[unknown] model,
	                       list[unknown] hyperparams,
	                       list[unknown] gradients)
	    return (list[unknown] model_result) {
	  # Applies one SGD-with-Nesterov-momentum step to every parameter.
	  #
	  # `model` holds the weights at positions 1-4, the biases at 5-8 and the
	  # matching velocities at 9-12 (weights) and 13-16 (biases); `gradients`
	  # holds the weight gradients at 1-4 and the bias gradients at 5-8.
	  # The returned list uses the same 16-entry layout as `model`.

	  # Optimizer settings
	  lr = as.double(as.scalar(hyperparams["learning_rate"]))
	  momentum = as.double(as.scalar(hyperparams["mu"]))

	  # Layer 1: unpack parameters, gradients and velocities, then update
	  l1_W = as.matrix(model[1])
	  l1_b = as.matrix(model[5])
	  l1_dW = as.matrix(gradients[1])
	  l1_db = as.matrix(gradients[5])
	  l1_vW = as.matrix(model[9])
	  l1_vb = as.matrix(model[13])
	  [l1_W, l1_vW] = sgd_nesterov::update(l1_W, l1_dW, lr, momentum, l1_vW)
	  [l1_b, l1_vb] = sgd_nesterov::update(l1_b, l1_db, lr, momentum, l1_vb)

	  # Layer 2
	  l2_W = as.matrix(model[2])
	  l2_b = as.matrix(model[6])
	  l2_dW = as.matrix(gradients[2])
	  l2_db = as.matrix(gradients[6])
	  l2_vW = as.matrix(model[10])
	  l2_vb = as.matrix(model[14])
	  [l2_W, l2_vW] = sgd_nesterov::update(l2_W, l2_dW, lr, momentum, l2_vW)
	  [l2_b, l2_vb] = sgd_nesterov::update(l2_b, l2_db, lr, momentum, l2_vb)

	  # Layer 3
	  l3_W = as.matrix(model[3])
	  l3_b = as.matrix(model[7])
	  l3_dW = as.matrix(gradients[3])
	  l3_db = as.matrix(gradients[7])
	  l3_vW = as.matrix(model[11])
	  l3_vb = as.matrix(model[15])
	  [l3_W, l3_vW] = sgd_nesterov::update(l3_W, l3_dW, lr, momentum, l3_vW)
	  [l3_b, l3_vb] = sgd_nesterov::update(l3_b, l3_db, lr, momentum, l3_vb)

	  # Layer 4
	  l4_W = as.matrix(model[4])
	  l4_b = as.matrix(model[8])
	  l4_dW = as.matrix(gradients[4])
	  l4_db = as.matrix(gradients[8])
	  l4_vW = as.matrix(model[12])
	  l4_vb = as.matrix(model[16])
	  [l4_W, l4_vW] = sgd_nesterov::update(l4_W, l4_dW, lr, momentum, l4_vW)
	  [l4_b, l4_vb] = sgd_nesterov::update(l4_b, l4_db, lr, momentum, l4_vb)

	  # Repack in the original order: weights, biases, weight velocities, bias velocities
	  model_result = list(l1_W, l2_W, l3_W, l4_W,
	                      l1_b, l2_b, l3_b, l4_b,
	                      l1_vW, l2_vW, l3_vW, l4_vW,
	                      l1_vb, l2_vb, l3_vb, l4_vb)
	}