blob: 69c7e7604422dc2c048390166d387858dab53b72 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* This file implements all needed functions to evaluate a convolutional neural network of the "LeNet" architecture
* on different execution schemes and with different inputs, for example a federated input matrix.
*/
# Imports
source("scripts/nn/layers/affine.dml") as affine
source("scripts/nn/layers/conv2d_builtin.dml") as conv2d
source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("scripts/nn/layers/dropout.dml") as dropout
source("scripts/nn/layers/l2_reg.dml") as l2_reg
source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d
source("scripts/nn/layers/relu.dml") as relu
source("scripts/nn/layers/softmax.dml") as softmax
source("scripts/nn/optim/sgd_nesterov.dml") as sgd_nesterov
/*
* Trains a convolutional net using the "LeNet" architecture, single-threaded, with a conventional mini-batch loop.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector. The targets, Y, have K
* classes, and are one-hot encoded.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win)
* - y: Target matrix, of shape (N, K)
* - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
* - y_val: Target validation matrix, of shape (N, K)
* - C: Number of input channels (dimensionality of input depth)
* - Hin: Input height
* - Win: Input width
* - epochs: Total number of full training loops over the full data set
* - batch_size: Batch size
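* - eta: The learning rate for the SGD
* - seed: Optional seed for the weight initialization (default -1, which is passed
*     unchanged to every init call; otherwise seed, seed+1, seed+2 and seed+3 are
*     used for layers 1 to 4)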
*
* Outputs:
* - model_trained: List containing
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
* - b1: 1st layer biases vector, of shape (F1, 1)
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
* - b2: 2nd layer biases vector, of shape (F2, 1)
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
* - b3: 3rd layer biases vector, of shape (1, N3)
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
* - b4: 4th layer biases vector, of shape (1, K)
*/
train = function(matrix[double] X, matrix[double] y,
                 matrix[double] X_val, matrix[double] y_val,
                 int epochs, int batch_size, double eta,
                 int C, int Hin, int Win,
                 int seed = -1)
    return (list[unknown] model) {
  # Sequential mini-batch training: every batch is pushed through `gradients`
  # and the resulting gradient list is applied by `aggregation`.
  # X_val and y_val are accepted but not read by this function.
  num_examples = nrow(X)
  num_classes = ncol(y)

  # Network layout:
  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
  Hf = 5       # filter height
  Wf = 5       # filter width
  stride = 1
  pad = 2      # keeps spatial dimensions: (Hf - stride) / 2
  F1 = 32      # number of filters in conv1
  F2 = 64      # number of filters in conv2
  N3 = 512     # number of nodes in affine3
  # affine4 has one node per class (num_classes)

  # Per-layer seeds: -1 is handed through untouched, otherwise each layer gets its own offset
  seed_conv2 = ifelse(seed==-1, -1, seed + 1)
  seed_affine3 = ifelse(seed==-1, -1, seed + 2)
  seed_affine4 = ifelse(seed==-1, -1, seed + 3)

  # Parameter initialization
  [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)                          # inputs: (N, C*Hin*Win)
  [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = seed_conv2)                   # inputs: (N, F1*(Hin/2)*(Win/2))
  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = seed_affine3)     # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
  [W4, b4] = affine::init(N3, num_classes, seed = seed_affine4)                # inputs: (N, N3)
  W4 = W4 / sqrt(2)  # rescaled because this layer feeds a softmax rather than a relu

  # Optimizer state for SGD with Nesterov momentum (one velocity per parameter)
  momentum = 0.9
  lr_decay = 0.95
  vW1 = sgd_nesterov::init(W1)
  vb1 = sgd_nesterov::init(b1)
  vW2 = sgd_nesterov::init(W2)
  vb2 = sgd_nesterov::init(b2)
  vW3 = sgd_nesterov::init(W3)
  vb3 = sgd_nesterov::init(b3)
  vW4 = sgd_nesterov::init(W4)
  vb4 = sgd_nesterov::init(b4)

  # Model layout expected by gradients/aggregation/predict:
  # weights, then biases, then weight velocities, then bias velocities
  model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)

  # L2 regularization strength
  reg_strength = 5e-04

  # Hyper parameters are looked up by name in gradients/aggregation
  hyperparams = list(learning_rate=eta, mu=momentum, decay=lr_decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=reg_strength, F1=F1, F2=F2, N3=N3)

  # Number of mini-batches needed to cover the data once
  num_batches = ceil(num_examples / batch_size)

  for (epoch in 1:epochs) {
    for (b in 1:num_batches) {
      # Row range of the current mini-batch (the last batch may be shorter)
      row_lo = ((b-1) * batch_size) %% num_examples + 1
      row_hi = min(num_examples, row_lo + batch_size - 1)
      batch_features = X[row_lo:row_hi,]
      batch_labels = y[row_lo:row_hi,]

      # Compute the gradients for this batch, then apply them to the model
      batch_grads = gradients(model, hyperparams, batch_features, batch_labels)
      model = aggregation(model, hyperparams, batch_grads)
    }
  }
}
/*
* Trains a convolutional net using the "LeNet" architecture using a parameter server with specified properties.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector. The targets, Y, have K
* classes, and are one-hot encoded.
*
* Inputs:
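* - X: Input data matrix, of shape (N, C*Hin*Win)
* - y: Target matrix, of shape (N, K)
* - X_val: Input validation data matrix, of shape (N, C*Hin*Win)
* - y_val: Target validation matrix, of shape (N, K)
* - C: Number of input channels (dimensionality of input depth)
* - Hin: Input height
* - Win: Input width
* - epochs: Total number of full training loops over the full data set
* - batch_size: Batch size
* - eta: The learning rate for the SGD
* - num_workers: Number of workers to create
* - utype: Update type, forwarded to paramserv as `utype`
* - freq: Update frequency, forwarded to paramserv as `freq`
* - scheme: Data partitioning scheme, forwarded to paramserv as `scheme`
* - runtime_balancing: Runtime balancing strategy, forwarded to paramserv as `runtime_balancing`
* - seed: Optional seed for the weight initialization (default -1, which is passed
*     unchanged to every init call; otherwise seed, seed+1, seed+2 and seed+3 are
*     used for layers 1 to 4)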
*
* Outputs:
* - model_trained: List containing
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
* - b1: 1st layer biases vector, of shape (F1, 1)
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
* - b2: 2nd layer biases vector, of shape (F2, 1)
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
* - b3: 3rd layer biases vector, of shape (1, N3)
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
* - b4: 4th layer biases vector, of shape (1, K)
*/
train_paramserv = function(matrix[double] X, matrix[double] y,
                           matrix[double] X_val, matrix[double] y_val,
                           int num_workers, int epochs, string utype, string freq, int batch_size, string scheme, string runtime_balancing,
                           double eta, int C, int Hin, int Win,
                           int seed = -1)
    return (list[unknown] model) {
  # Initializes the LeNet parameters and optimizer state, then delegates the
  # training loop to the paramserv builtin, which calls this file's `gradients`
  # (upd) and `aggregation` (agg) functions.
  #
  # Returns the model list produced by paramserv, laid out as
  # (W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4).
  N = nrow(X)
  K = ncol(y)

  # Create network:
  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
  Hf = 5  # filter height
  Wf = 5  # filter width
  stride = 1
  pad = 2  # For same dimensions, (Hf - stride) / 2
  F1 = 32  # num conv filters in conv1
  F2 = 64  # num conv filters in conv2
  N3 = 512  # num nodes in affine3
  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)

  # Each layer gets its own seed offset; a seed of -1 is passed through unchanged
  [W1, b1] = conv2d::init(F1, C, Hf, Wf, seed = seed)  # inputs: (N, C*Hin*Win)
  lseed = ifelse(seed==-1, -1, seed + 1);
  [W2, b2] = conv2d::init(F2, F1, Hf, Wf, seed = lseed)  # inputs: (N, F1*(Hin/2)*(Win/2))
  lseed = ifelse(seed==-1, -1, seed + 2);
  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3, seed = lseed)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
  lseed = ifelse(seed==-1, -1, seed + 3);
  [W4, b4] = affine::init(N3, K, seed = lseed)  # inputs: (N, N3)
  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu

  # Initialize SGD w/ Nesterov momentum optimizer
  # (the learning rate is taken directly from `eta` when building hyperparams)
  mu = 0.9  # momentum
  decay = 0.95  # learning rate decay constant
  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)

  # Regularization
  lambda = 5e-04

  # Create the model list: weights, biases, weight velocities, bias velocities
  model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)

  # Create the hyper parameter list (read by name in gradients/aggregation)
  hyperparams = list(learning_rate=eta, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3)

  # Use paramserv function; upd/agg reference the functions by script path and name
  model = paramserv(model=model, features=X, labels=y, val_features=X_val, val_labels=y_val,
    upd="./src/test/scripts/functions/federated/paramserv/CNN.dml::gradients",
    agg="./src/test/scripts/functions/federated/paramserv/CNN.dml::aggregation",
    k=num_workers, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size,
    scheme=scheme, runtime_balancing=runtime_balancing, hyperparams=hyperparams)
}
/*
* Computes the class probability predictions of a convolutional
* net using the "LeNet" architecture.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win)
* - C: Number of input channels (dimensionality of input depth)
* - Hin: Input height
* - Win: Input width
* - batch_size: Batch size
* - model: List containing
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf)
* - b1: 1st layer biases vector, of shape (F1, 1)
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf)
* - b2: 2nd layer biases vector, of shape (F2, 1)
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3)
* - b3: 3rd layer biases vector, of shape (1, N3)
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K)
* - b4: 4th layer biases vector, of shape (1, K)
*
* Outputs:
* - probs: Class probabilities, of shape (N, K)
*/
predict = function(matrix[double] X, int C, int Hin, int Win, int batch_size, list[unknown] model)
    return (matrix[double] probs) {
  # Runs the forward pass (without dropout) over X in mini-batches and
  # collects the softmax output of every batch into `probs`.

  # Unpack parameters layer by layer; the model list stores the four weight
  # matrices at positions 1-4 and the four biases at positions 5-8
  W1 = as.matrix(model[1]); b1 = as.matrix(model[5])
  W2 = as.matrix(model[2]); b2 = as.matrix(model[6])
  W3 = as.matrix(model[3]); b3 = as.matrix(model[7])
  W4 = as.matrix(model[4]); b4 = as.matrix(model[8])

  num_examples = nrow(X)

  # Network:
  ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
  Hf = 5       # filter height
  Wf = 5       # filter width
  stride = 1
  pad = 2      # keeps spatial dimensions: (Hf - stride) / 2
  # Layer sizes are recovered from the parameter shapes
  F1 = nrow(W1)            # filters in conv1
  F2 = nrow(W2)            # filters in conv2
  N3 = ncol(W3)            # nodes in affine3
  num_classes = ncol(W4)   # nodes in affine4 (one per class)

  # One output row per input example
  probs = matrix(0, rows=num_examples, cols=num_classes)
  num_batches = ceil(num_examples / batch_size)

  parfor(b in 1:num_batches, check=0) {
    # Row range of this mini-batch (the last batch may be shorter)
    row_lo = ((b-1) * batch_size) %% num_examples + 1
    row_hi = min(num_examples, row_lo + batch_size - 1)
    batch = X[row_lo:row_hi,]

    ## layer 1: conv1 -> relu1 -> pool1
    [conv1_out, conv1_H, conv1_W] = conv2d::forward(batch, W1, b1, C, Hin, Win, Hf, Wf,
                                                    stride, stride, pad, pad)
    relu1_out = relu::forward(conv1_out)
    [pool1_out, pool1_H, pool1_W] = max_pool2d::forward(relu1_out, F1, conv1_H, conv1_W, 2, 2, 2, 2, 0, 0)

    ## layer 2: conv2 -> relu2 -> pool2
    [conv2_out, conv2_H, conv2_W] = conv2d::forward(pool1_out, W2, b2, F1, pool1_H, pool1_W, Hf, Wf,
                                                    stride, stride, pad, pad)
    relu2_out = relu::forward(conv2_out)
    [pool2_out, pool2_H, pool2_W] = max_pool2d::forward(relu2_out, F2, conv2_H, conv2_W, 2, 2, 2, 2, 0, 0)

    ## layer 3: affine3 -> relu3
    affine3_out = affine::forward(pool2_out, W3, b3)
    relu3_out = relu::forward(affine3_out)

    ## layer 4: affine4 -> softmax
    logits = affine::forward(relu3_out, W4, b4)
    batch_probs = softmax::forward(logits)

    # Write this batch's predictions into its rows of the result
    probs[row_lo:row_hi,] = batch_probs
  }
}
/*
* Evaluates a convolutional net using the "LeNet" architecture.
*
* The probs matrix contains the class probability predictions
* of K classes over N examples. The targets, y, have K classes,
* and are one-hot encoded.
*
* Inputs:
* - probs: Class probabilities, of shape (N, K)
* - y: Target matrix, of shape (N, K)
*
* Outputs:
* - loss: Scalar loss, of shape (1)
* - accuracy: Scalar accuracy, of shape (1)
*/
eval = function(matrix[double] probs, matrix[double] y)
    return (double loss, double accuracy) {
  # Accuracy: share of rows where the index of the largest predicted
  # probability equals the index of the largest target entry
  predicted_class = rowIndexMax(probs)
  target_class = rowIndexMax(y)
  is_hit = predicted_class == target_class
  accuracy = mean(is_hit)

  # Loss: cross-entropy between predictions and targets
  loss = cross_entropy_loss::forward(probs, y)
}
# Should always use 'features' (batch features), 'labels' (batch labels),
# 'hyperparams', 'model' as the arguments
# and return the gradients of type list
gradients = function(list[unknown] model,
                     list[unknown] hyperparams,
                     matrix[double] features,
                     matrix[double] labels)
    return (list[unknown] gradients) {
  # Computes the parameter gradients of the LeNet for one mini-batch.
  #
  # Inputs:
  #  - model: list whose entries 1-4 are W1..W4 and entries 5-8 are b1..b4
  #  - hyperparams: named list; reads C, Hin, Win, Hf, Wf, stride, pad,
  #      lambda, F1, F2 and N3
  #  - features: batch features
  #  - labels: batch labels
  #
  # Outputs:
  #  - gradients: list(dW1, dW2, dW3, dW4, db1, db2, db3, db4), where the
  #      weight gradients include the L2 regularization term
  #
  # Side effect: prints the training loss and accuracy of the batch.
  C = as.integer(as.scalar(hyperparams["C"]))
  Hin = as.integer(as.scalar(hyperparams["Hin"]))
  Win = as.integer(as.scalar(hyperparams["Win"]))
  Hf = as.integer(as.scalar(hyperparams["Hf"]))
  Wf = as.integer(as.scalar(hyperparams["Wf"]))
  stride = as.integer(as.scalar(hyperparams["stride"]))
  pad = as.integer(as.scalar(hyperparams["pad"]))
  lambda = as.double(as.scalar(hyperparams["lambda"]))
  F1 = as.integer(as.scalar(hyperparams["F1"]))
  F2 = as.integer(as.scalar(hyperparams["F2"]))
  N3 = as.integer(as.scalar(hyperparams["N3"]))
  W1 = as.matrix(model[1])
  W2 = as.matrix(model[2])
  W3 = as.matrix(model[3])
  W4 = as.matrix(model[4])
  b1 = as.matrix(model[5])
  b2 = as.matrix(model[6])
  b3 = as.matrix(model[7])
  b4 = as.matrix(model[8])

  # Compute forward pass
  ## layer 1: conv1 -> relu1 -> pool1
  [outc1, Houtc1, Woutc1] = conv2d::forward(features, W1, b1, C, Hin, Win, Hf, Wf,
                                            stride, stride, pad, pad)
  outr1 = relu::forward(outc1)
  [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
  ## layer 2: conv2 -> relu2 -> pool2
  [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
                                            stride, stride, pad, pad)
  outr2 = relu::forward(outc2)
  [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
  ## layer 3: affine3 -> relu3 -> dropout
  outa3 = affine::forward(outp2, W3, b3)
  outr3 = relu::forward(outa3)
  [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
  ## layer 4: affine4 -> softmax
  outa4 = affine::forward(outd3, W4, b4)
  probs = softmax::forward(outa4)

  # Compute loss & accuracy for training data
  loss = cross_entropy_loss::forward(probs, labels)
  accuracy = mean(rowIndexMax(probs) == rowIndexMax(labels))
  print("[+] Completed forward pass on batch: train loss: " + loss + ", train accuracy: " + accuracy)

  # Compute data backward pass
  ## loss
  dprobs = cross_entropy_loss::backward(probs, labels)
  ## layer 4: affine4 -> softmax
  douta4 = softmax::backward(dprobs, outa4)
  # affine4 consumed the dropout output (outd3) in the forward pass, so its
  # backward pass must receive that same input for dW4 to be correct
  [doutd3, dW4, db4] = affine::backward(douta4, outd3, W4, b4)
  ## layer 3: affine3 -> relu3 -> dropout
  doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
  douta3 = relu::backward(doutr3, outa3)
  [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
  ## layer 2: conv2 -> relu2 -> pool2
  doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0)
  doutc2 = relu::backward(doutr2, outc2)
  [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
                                        Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
  ## layer 1: conv1 -> relu1 -> pool1
  doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0)
  doutc1 = relu::backward(doutr1, outc1)
  [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, features, W1, b1, C, Hin, Win,
                                          Hf, Wf, stride, stride, pad, pad)

  # Compute regularization backward pass
  dW1_reg = l2_reg::backward(W1, lambda)
  dW2_reg = l2_reg::backward(W2, lambda)
  dW3_reg = l2_reg::backward(W3, lambda)
  dW4_reg = l2_reg::backward(W4, lambda)
  dW1 = dW1 + dW1_reg
  dW2 = dW2 + dW2_reg
  dW3 = dW3 + dW3_reg
  dW4 = dW4 + dW4_reg

  # Same ordering as the first eight model entries: weights, then biases
  gradients = list(dW1, dW2, dW3, dW4, db1, db2, db3, db4)
}
# Should use the arguments named 'model', 'gradients', 'hyperparams'
# and return always a model of type list
aggregation = function(list[unknown] model,
                       list[unknown] hyperparams,
                       list[unknown] gradients)
    return (list[unknown] model_result) {
  # Applies one SGD-with-Nesterov-momentum step to every parameter.
  # The model list holds weights (1-4), biases (5-8), weight velocities (9-12)
  # and bias velocities (13-16); the gradient list holds weight gradients (1-4)
  # and bias gradients (5-8).

  # Optimizer settings (looked up by name)
  lr = as.double(as.scalar(hyperparams["learning_rate"]))
  momentum = as.double(as.scalar(hyperparams["mu"]))

  # Layer 1: parameter, gradient and velocity
  W1 = as.matrix(model[1]);  dW1 = as.matrix(gradients[1]);  vW1 = as.matrix(model[9])
  b1 = as.matrix(model[5]);  db1 = as.matrix(gradients[5]);  vb1 = as.matrix(model[13])
  # Layer 2
  W2 = as.matrix(model[2]);  dW2 = as.matrix(gradients[2]);  vW2 = as.matrix(model[10])
  b2 = as.matrix(model[6]);  db2 = as.matrix(gradients[6]);  vb2 = as.matrix(model[14])
  # Layer 3
  W3 = as.matrix(model[3]);  dW3 = as.matrix(gradients[3]);  vW3 = as.matrix(model[11])
  b3 = as.matrix(model[7]);  db3 = as.matrix(gradients[7]);  vb3 = as.matrix(model[15])
  # Layer 4
  W4 = as.matrix(model[4]);  dW4 = as.matrix(gradients[4]);  vW4 = as.matrix(model[12])
  b4 = as.matrix(model[8]);  db4 = as.matrix(gradients[8]);  vb4 = as.matrix(model[16])

  # Update each parameter together with its velocity
  [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, momentum, vW1)
  [b1, vb1] = sgd_nesterov::update(b1, db1, lr, momentum, vb1)
  [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, momentum, vW2)
  [b2, vb2] = sgd_nesterov::update(b2, db2, lr, momentum, vb2)
  [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, momentum, vW3)
  [b3, vb3] = sgd_nesterov::update(b3, db3, lr, momentum, vb3)
  [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, momentum, vW4)
  [b4, vb4] = sgd_nesterov::update(b4, db4, lr, momentum, vb4)

  # Repack in the same layout the model came in
  model_result = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4)
}