/*
 * This file implements all functions needed to evaluate a simple feed-forward neural network
 * under different execution schemes and with different inputs, for example a federated input matrix.
 */
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd.dml") as sgd
/*
 * Trains a simple feed-forward neural network with two hidden layers,
 * single-threaded, in the conventional (non-parameter-server) way.
 *
 * The input matrix has one example per row (N) and D features.
 * The targets, y, have K classes, and are one-hot encoded.
 *
 * Inputs:
 *  - X: Input data matrix of shape (N, D)
 *  - y: Target matrix of shape (N, K)
 *  - X_val: Input validation data matrix of shape (N_val, D)
 *  - y_val: Target validation matrix of shape (N_val, K)
 *  - epochs: Total number of full training loops over the full data set
 *  - batch_size: Batch size
 *  - learning_rate: The learning rate for the SGD
 *
 * Outputs:
 *  - model_trained: List, ordered as (W1, W2, W3, b1, b2, b3), containing
 *     - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *     - b1: 1st layer biases vector, of shape (1, 200)
 *     - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *     - b2: 2nd layer biases vector, of shape (1, 200)
 *     - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *     - b3: 3rd layer biases vector, of shape (1, K)
 */
train = function(matrix[double] X, matrix[double] y,
                 matrix[double] X_val, matrix[double] y_val,
                 int epochs, int batch_size, double learning_rate)
    return (list[unknown] model_trained) {
  # Single-threaded mini-batch SGD training of the network
  #   input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  # using the same gradients()/aggregation() functions that the parameter
  # server variant uses.
  #
  # Inputs:
  #  - X, y: training features (one example per row) and one-hot targets
  #  - X_val, y_val: validation data; not referenced by this function's body
  #  - epochs: number of passes over the training data
  #  - batch_size: number of rows per mini-batch
  #  - learning_rate: step size handed to aggregation() via the hyperparams list
  #
  # Outputs:
  #  - model_trained: list(W1, W2, W3, b1, b2, b3) after the last update

  N = nrow(X) # num examples
  D = ncol(X) # num features
  K = ncol(y) # num classes

  # Create the network:
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  [W1, b1] = affine::init(D, 200)
  [W2, b2] = affine::init(200, 200)
  [W3, b3] = affine::init(200, K)
  W3 = W3 / sqrt(2) # different initialization, since being fed into softmax, instead of relu

  # Create the hyper parameter list
  hyperparams = list(learning_rate=learning_rate)

  # Number of mini-batches needed to cover all N rows once
  iters = ceil(N / batch_size)

  print("[+] Starting optimization")
  print("[+] Learning rate: " + learning_rate)
  print("[+] Batch size: " + batch_size)
  print("[+] Iterations per epoch: " + iters + "\n")

  for (e in 1:epochs) {
    print("[+] Starting epoch: " + e)

    for (i in 1:iters) {
      # Pack the current parameters in the order gradients()/aggregation() expect
      model_list = list(W1, W2, W3, b1, b2, b3)

      # Get next batch; the last batch of an epoch may be shorter than batch_size
      beg = ((i-1) * batch_size) %% N + 1
      end = min(N, beg + batch_size - 1)
      X_batch = X[beg:end,]
      y_batch = y[beg:end,]

      # One SGD step: compute gradients on the batch, then apply them
      gradients_list = gradients(model_list, hyperparams, X_batch, y_batch)
      model_updated = aggregation(model_list, hyperparams, gradients_list)

      W1 = as.matrix(model_updated[1])
      W2 = as.matrix(model_updated[2])
      W3 = as.matrix(model_updated[3])
      b1 = as.matrix(model_updated[4])
      b2 = as.matrix(model_updated[5])
      b3 = as.matrix(model_updated[6])
    }
  }

  # Always return the final parameters. The result must not depend on a
  # print-interval condition (floor(iters / 25) is 0 for fewer than 25
  # iterations, which would make such a condition a modulo by zero).
  model_trained = list(W1, W2, W3, b1, b2, b3)
}
/*
 * Trains a simple feed-forward neural network with two hidden layers
 * using a parameter server with the specified properties.
 *
 * The input matrix has one example per row (N) and D features.
 * The targets, y, have K classes, and are one-hot encoded.
 *
 * Inputs:
 *  - X: Input data matrix of shape (N, D)
 *  - y: Target matrix of shape (N, K)
 *  - X_val: Input validation data matrix of shape (N_val, D)
 *  - y_val: Target validation matrix of shape (N_val, K)
 *  - epochs: Total number of full training loops over the full data set
 *  - batch_size: Batch size
 *  - learning_rate: The learning rate for the SGD
 *  - workers: Number of workers to create
 *  - utype: Update type of the parameter server (e.g. synchronous or asynchronous)
 *  - freq: Update frequency, i.e. how often workers push their gradients (e.g. per batch or per epoch)
 *  - scheme: Data partitioning scheme used to distribute the data among the workers
 *  - mode: Execution mode, local or distributed
 *
 * Outputs:
 *  - model_trained: List containing
 *     - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *     - b1: 1st layer biases vector, of shape (1, 200)
 *     - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *     - b2: 2nd layer biases vector, of shape (1, 200)
 *     - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *     - b3: 3rd layer biases vector, of shape (1, K)
 */
train_paramserv = function(matrix[double] X, matrix[double] y,
                           matrix[double] X_val, matrix[double] y_val,
                           int epochs, int workers,
                           string utype, string freq, int batch_size, string scheme, string mode, double learning_rate)
    return (list[unknown] model_trained) {
  # Trains the network
  #   input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  # by delegating the optimization loop to the paramserv builtin, with this
  # file's gradients() as the update function and aggregation() as the
  # aggregation function.
  #
  # Inputs:
  #  - X, y: training features and one-hot targets (passed as features/labels)
  #  - X_val, y_val: validation data (passed as val_features/val_labels)
  #  - epochs, batch_size: passed as paramserv's epochs/batchsize
  #  - workers: passed as paramserv's k
  #  - utype, freq, scheme, mode: passed through unchanged to the paramserv
  #    arguments of the same name
  #  - learning_rate: stored in the hyperparams list read by aggregation()
  #
  # Outputs:
  #  - model_trained: the list returned by paramserv

  N = nrow(X) # num examples
  D = ncol(X) # num features
  K = ncol(y) # num classes

  # Create the network:
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  [W1, b1] = affine::init(D, 200)
  [W2, b2] = affine::init(200, 200)
  [W3, b3] = affine::init(200, K)
  # Same output-layer scaling as the single-threaded train() function, so both
  # entry points start from identically distributed parameters.
  W3 = W3 / sqrt(2) # different initialization, since being fed into softmax, instead of relu

  # Create the model list (order must match what gradients()/aggregation() unpack)
  model_list = list(W1, W2, W3, b1, b2, b3)

  # Create the hyper parameter list
  params = list(learning_rate=learning_rate)

  # Use paramserv function
  model_trained = paramserv(model=model_list,
                            features=X, labels=y,
                            val_features=X_val, val_labels=y_val,
                            upd="./src/test/scripts/functions/federated/paramserv/TwoNN.dml::gradients",
                            agg="./src/test/scripts/functions/federated/paramserv/TwoNN.dml::aggregation",
                            mode=mode, utype=utype, freq=freq,
                            epochs=epochs, batchsize=batch_size, k=workers,
                            scheme=scheme, hyperparams=params, checkpointing="NONE")
}
/*
 * Computes the class probability predictions of a simple feed-forward neural network.
 *
 * Inputs:
 *  - X: The input data matrix of shape (N, D)
 *  - model: List, ordered as (W1, W2, W3, b1, b2, b3), containing
 *     - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *     - b1: 1st layer biases vector, of shape (1, 200)
 *     - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *     - b2: 2nd layer biases vector, of shape (1, 200)
 *     - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *     - b3: 3rd layer biases vector, of shape (1, K)
 *
 * Outputs:
 *  - probs: Class probabilities, of shape (N, K)
 */
predict = function(matrix[double] X,
                   list[unknown] model)
    return (matrix[double] probs) {
  # Forward pass only: affine -> relu -> affine -> relu -> affine -> softmax.
  # The model list is read positionally as (W1, W2, W3, b1, b2, b3).

  # Unpack weights (positions 1-3) and biases (positions 4-6)
  weights_1 = as.matrix(model[1])
  weights_2 = as.matrix(model[2])
  weights_3 = as.matrix(model[3])
  bias_1 = as.matrix(model[4])
  bias_2 = as.matrix(model[5])
  bias_3 = as.matrix(model[6])

  # First hidden layer
  hidden_1_lin = affine::forward(X, weights_1, bias_1)
  hidden_1 = relu::forward(hidden_1_lin)

  # Second hidden layer
  hidden_2_lin = affine::forward(hidden_1, weights_2, bias_2)
  hidden_2 = relu::forward(hidden_2_lin)

  # Output layer followed by softmax to obtain class probabilities
  scores = affine::forward(hidden_2, weights_3, bias_3)
  probs = softmax::forward(scores)
}
/*
 * Evaluates a simple feed-forward neural network.
 *
 * The probs matrix contains the class probability predictions
 * of K classes over N examples. The targets, y, have K classes,
 * and are one-hot encoded.
 *
 * Inputs:
 *  - probs: Class probabilities, of shape (N, K).
 *  - y: Target matrix, of shape (N, K).
 *
 * Outputs:
 *  - loss: Scalar loss.
 *  - accuracy: Scalar accuracy.
 */
eval = function(matrix[double] probs, matrix[double] y)
    return (double loss, double accuracy) {
  # Scores predictions against one-hot targets.
  #  - loss: cross-entropy loss of probs w.r.t. y
  #  - accuracy: fraction of rows whose arg-max column in probs equals the
  #    arg-max column in y

  # Accuracy: compare the per-row index of the largest entry
  predicted_class = rowIndexMax(probs)
  target_class = rowIndexMax(y)
  accuracy = mean(predicted_class == target_class)

  # Loss
  loss = cross_entropy_loss::forward(probs, y)
}
# Should always use 'features' (batch features), 'labels' (batch labels),
# 'hyperparams', 'model' as the arguments
# and return the gradients of type list
gradients = function(list[unknown] model,
                     list[unknown] hyperparams,
                     matrix[double] features,
                     matrix[double] labels)
    return (list[unknown] gradients) {
  # Update function for one mini-batch: runs the forward pass, prints the
  # batch loss/accuracy, runs the backward pass and returns the parameter
  # gradients as list(dW1, dW2, dW3, db1, db2, db3).
  # The model list is read positionally as (W1, W2, W3, b1, b2, b3);
  # hyperparams is accepted but not read here.

  # Unpack parameters
  W_in = as.matrix(model[1])
  W_hid = as.matrix(model[2])
  W_out = as.matrix(model[3])
  b_in = as.matrix(model[4])
  b_hid = as.matrix(model[5])
  b_out = as.matrix(model[6])

  # Forward pass
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  z1 = affine::forward(features, W_in, b_in)
  a1 = relu::forward(z1)
  z2 = affine::forward(a1, W_hid, b_hid)
  a2 = relu::forward(z2)
  z3 = affine::forward(a2, W_out, b_out)
  class_probs = softmax::forward(z3)

  # Training loss & accuracy on this batch (reported only, not returned)
  batch_loss = cross_entropy_loss::forward(class_probs, labels)
  batch_acc = mean(rowIndexMax(class_probs) == rowIndexMax(labels))
  print("[+] Completed forward pass on batch: train loss: " + batch_loss + ", train accuracy: " + batch_acc)

  # Backward pass, from the loss back to the first layer
  g_probs = cross_entropy_loss::backward(class_probs, labels)
  g_z3 = softmax::backward(g_probs, z3)
  [g_a2, g_W_out, g_b_out] = affine::backward(g_z3, a2, W_out, b_out)
  g_z2 = relu::backward(g_a2, z2)
  [g_a1, g_W_hid, g_b_hid] = affine::backward(g_z2, a1, W_hid, b_hid)
  g_z1 = relu::backward(g_a1, z1)
  # The gradient w.r.t. the input features is produced by the call but unused
  [g_features, g_W_in, g_b_in] = affine::backward(g_z1, features, W_in, b_in)

  # Same ordering as the model list: weights first, then biases
  gradients = list(g_W_in, g_W_hid, g_W_out, g_b_in, g_b_hid, g_b_out)
}
# Should use the arguments named 'model', 'gradients', 'hyperparams'
# and return always a model of type list
aggregation = function(list[unknown] model,
                       list[unknown] hyperparams,
                       list[unknown] gradients)
    return (list[unknown] model_result) {
  # Aggregation function: applies one SGD step to every parameter.
  # model is read positionally as (W1, W2, W3, b1, b2, b3) and gradients as
  # (dW1, dW2, dW3, db1, db2, db3); the step size is the "learning_rate"
  # entry of hyperparams. Returns the updated parameters in model order.

  # Step size
  lr = as.double(as.scalar(hyperparams["learning_rate"]))

  # Layer 1
  W_first = as.matrix(model[1])
  b_first = as.matrix(model[4])
  gW_first = as.matrix(gradients[1])
  gb_first = as.matrix(gradients[4])
  W_first = sgd::update(W_first, gW_first, lr)
  b_first = sgd::update(b_first, gb_first, lr)

  # Layer 2
  W_second = as.matrix(model[2])
  b_second = as.matrix(model[5])
  gW_second = as.matrix(gradients[2])
  gb_second = as.matrix(gradients[5])
  W_second = sgd::update(W_second, gW_second, lr)
  b_second = sgd::update(b_second, gb_second, lr)

  # Layer 3
  W_third = as.matrix(model[3])
  b_third = as.matrix(model[6])
  gW_third = as.matrix(gradients[3])
  gb_third = as.matrix(gradients[6])
  W_third = sgd::update(W_third, gW_third, lr)
  b_third = sgd::update(b_third, gb_third, lr)

  model_result = list(W_first, W_second, W_third, b_first, b_second, b_third)
}