#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

/*
 * This file implements all functions needed to train and evaluate a simple
 * feed-forward neural network under different execution schemes and with
 * different inputs, for example a federated input matrix.
 */

# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd.dml") as sgd

/*
 * Trains a simple feed-forward neural network with two hidden layers,
 * single-threaded, in the conventional way (mini-batch SGD).
 *
 * The input matrix has one example per row (N) and D features.
 * The targets, y, have K classes, and are one-hot encoded.
 *
 * Inputs:
 *  - X: Input data matrix of shape (N, D)
 *  - y: Target matrix of shape (N, K)
 *  - X_val: Input validation data matrix of shape (N_val, D)
 *  - y_val: Target validation matrix of shape (N_val, K)
 *  - epochs: Total number of full training loops over the full data set
 *  - batch_size: Batch size
 *  - learning_rate: The learning rate for SGD
 *
 * Outputs:
 *  - model_trained: List containing
 *      - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *      - b1: 1st layer biases vector, of shape (200, 1)
 *      - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *      - b2: 2nd layer biases vector, of shape (200, 1)
 *      - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *      - b3: 3rd layer biases vector, of shape (K, 1)
 */
train = function(matrix[double] X, matrix[double] y,
    matrix[double] X_val, matrix[double] y_val,
    int epochs, int batch_size, double learning_rate)
  return (list[unknown] model_trained) {

  N = nrow(X)  # num examples
  D = ncol(X)  # num features
  K = ncol(y)  # num classes

  # Create the network:
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  [W1, b1] = affine::init(D, 200)
  [W2, b2] = affine::init(200, 200)
  [W3, b3] = affine::init(200, K)
  W3 = W3 / sqrt(2)  # scale down the initialization, since this layer feeds into softmax instead of relu

  # Create the hyper parameter list
  hyperparams = list(learning_rate=learning_rate)
  # Calculate iterations per epoch and the progress print interval
  iters = ceil(N / batch_size)
  print_interval = max(1, floor(iters / 25))  # guard against a zero interval when iters < 25
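  # e.g., N = 60000 and batch_size = 32 (values assumed for illustration) give
  # iters = ceil(60000 / 32) = 1875 and print_interval = floor(1875 / 25) = 75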

  print("[+] Starting optimization")
  print("[+] Learning rate: " + learning_rate)
  print("[+] Batch size: " + batch_size)
  print("[+] Iterations per epoch: " + iters + "\n")

  for (e in 1:epochs) {
    print("[+] Starting epoch: " + e)
    print("|")
    for (i in 1:iters) {
      # Create the model list
      model_list = list(W1, W2, W3, b1, b2, b3)

      # Get next batch
      beg = ((i-1) * batch_size) %% N + 1
      end = min(N, beg + batch_size - 1)
      X_batch = X[beg:end,]
      y_batch = y[beg:end,]
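      # e.g., with N = 100 and batch_size = 32 (assumed for illustration):
      # i=1 -> rows 1:32, i=2 -> rows 33:64, i=3 -> rows 65:96, i=4 -> rows 97:100,
      # i.e. the last batch of an epoch may be smaller than batch_size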

      gradients_list = gradients(model_list, hyperparams, X_batch, y_batch)
      model_updated = aggregation(model_list, hyperparams, gradients_list)

      W1 = as.matrix(model_updated[1])
      W2 = as.matrix(model_updated[2])
      W3 = as.matrix(model_updated[3])
      b1 = as.matrix(model_updated[4])
      b2 = as.matrix(model_updated[5])
      b3 = as.matrix(model_updated[6])

      if ((i %% print_interval) == 0) {
        print("█")
      }
    }
    print("|")
  }

  model_trained = list(W1, W2, W3, b1, b2, b3)
}
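
/*
 * Example call (a sketch; X, y, X_val, y_val are assumed to be pre-loaded,
 * one-hot encoded matrices, and the epoch, batch size, and learning rate
 * values are illustrative only):
 *
 *   model = train(X, y, X_val, y_val, 10, 32, 0.01)
 */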

/*
 * Trains a simple feed-forward neural network with two hidden layers
 * using a parameter server with the specified properties.
 *
 * The input matrix has one example per row (N) and D features.
 * The targets, y, have K classes, and are one-hot encoded.
 *
 * Inputs:
 *  - X: Input data matrix of shape (N, D)
 *  - y: Target matrix of shape (N, K)
 *  - X_val: Input validation data matrix of shape (N_val, D)
 *  - y_val: Target validation matrix of shape (N_val, K)
 *  - epochs: Total number of full training loops over the full data set
 *  - workers: Number of workers to create
 *  - utype: Update type of the parameter server (e.g., BSP or ASP)
 *  - freq: Update frequency of the parameter server (e.g., per BATCH or per EPOCH)
 *  - batch_size: Batch size
 *  - scheme: Data partitioning scheme for the workers
 *  - mode: Execution mode (local or distributed)
 *  - learning_rate: The learning rate for SGD
 *
 * Outputs:
 *  - model_trained: List containing
 *      - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *      - b1: 1st layer biases vector, of shape (200, 1)
 *      - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *      - b2: 2nd layer biases vector, of shape (200, 1)
 *      - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *      - b3: 3rd layer biases vector, of shape (K, 1)
 */
train_paramserv = function(matrix[double] X, matrix[double] y,
    matrix[double] X_val, matrix[double] y_val,
    int epochs, int workers,
    string utype, string freq, int batch_size, string scheme, string mode,
    double learning_rate)
  return (list[unknown] model_trained) {

  N = nrow(X)  # num examples
  D = ncol(X)  # num features
  K = ncol(y)  # num classes

  # Create the network:
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  [W1, b1] = affine::init(D, 200)
  [W2, b2] = affine::init(200, 200)
  [W3, b3] = affine::init(200, K)

  # Create the model list
  model_list = list(W1, W2, W3, b1, b2, b3)
  # Create the hyper parameter list
  params = list(learning_rate=learning_rate)
  # Use the paramserv built-in function
  model_trained = paramserv(model=model_list, features=X, labels=y,
    val_features=X_val, val_labels=y_val,
    upd="./src/test/scripts/functions/federated/paramserv/TwoNN.dml::gradients",
    agg="./src/test/scripts/functions/federated/paramserv/TwoNN.dml::aggregation",
    mode=mode, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size,
    k=workers, scheme=scheme, hyperparams=params, checkpointing="NONE")
}
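
/*
 * Example call (a sketch; the utype, freq, scheme, and mode values shown are
 * common paramserv settings, assumed here purely for illustration):
 *
 *   model = train_paramserv(X, y, X_val, y_val, 10, 4,
 *     "BSP", "BATCH", 32, "DISJOINT_CONTIGUOUS", "LOCAL", 0.01)
 */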

/*
 * Computes the class probability predictions of a simple feed-forward neural network.
 *
 * Inputs:
 *  - X: The input data matrix of shape (N, D)
 *  - model: List containing
 *      - W1: 1st layer weights (parameters) matrix, of shape (D, 200)
 *      - b1: 1st layer biases vector, of shape (200, 1)
 *      - W2: 2nd layer weights (parameters) matrix, of shape (200, 200)
 *      - b2: 2nd layer biases vector, of shape (200, 1)
 *      - W3: 3rd layer weights (parameters) matrix, of shape (200, K)
 *      - b3: 3rd layer biases vector, of shape (K, 1)
 *
 * Outputs:
 *  - probs: Class probabilities, of shape (N, K)
 */
predict = function(matrix[double] X,
    list[unknown] model)
  return (matrix[double] probs) {

  W1 = as.matrix(model[1])
  W2 = as.matrix(model[2])
  W3 = as.matrix(model[3])
  b1 = as.matrix(model[4])
  b2 = as.matrix(model[5])
  b3 = as.matrix(model[6])

  # Forward pass: input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  out1relu = relu::forward(affine::forward(X, W1, b1))
  out2relu = relu::forward(affine::forward(out1relu, W2, b2))
  probs = softmax::forward(affine::forward(out2relu, W3, b3))
}

/*
 * Evaluates a simple feed-forward neural network.
 *
 * The probs matrix contains the class probability predictions
 * of K classes over N examples. The targets, y, have K classes,
 * and are one-hot encoded.
 *
 * Inputs:
 *  - probs: Class probabilities, of shape (N, K)
 *  - y: Target matrix, of shape (N, K)
 *
 * Outputs:
 *  - loss: Scalar loss
 *  - accuracy: Scalar accuracy
 */
eval = function(matrix[double] probs, matrix[double] y)
  return (double loss, double accuracy) {

  # Compute loss & accuracy
  loss = cross_entropy_loss::forward(probs, y)
  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
  accuracy = mean(correct_pred)
}
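
/*
 * Example evaluation pipeline (a sketch; model comes from train() or
 * train_paramserv() above, X_val and y_val from the caller):
 *
 *   probs = predict(X_val, model)
 *   [loss, accuracy] = eval(probs, y_val)
 *   print("[+] val loss: " + loss + ", val accuracy: " + accuracy)
 */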

# Gradients function in the form expected by paramserv: it must use the argument
# names 'model', 'hyperparams', 'features' (batch features), and 'labels'
# (batch labels), and must return the gradients as a list.
gradients = function(list[unknown] model,
    list[unknown] hyperparams,
    matrix[double] features,
    matrix[double] labels)
  return (list[unknown] gradients) {

  W1 = as.matrix(model[1])
  W2 = as.matrix(model[2])
  W3 = as.matrix(model[3])
  b1 = as.matrix(model[4])
  b2 = as.matrix(model[5])
  b3 = as.matrix(model[6])

  # Compute forward pass
  ## input -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
  out1 = affine::forward(features, W1, b1)
  out1relu = relu::forward(out1)
  out2 = affine::forward(out1relu, W2, b2)
  out2relu = relu::forward(out2)
  out3 = affine::forward(out2relu, W3, b3)
  probs = softmax::forward(out3)

  # Compute loss & accuracy for training data
  loss = cross_entropy_loss::forward(probs, labels)
  accuracy = mean(rowIndexMax(probs) == rowIndexMax(labels))
  print("[+] Completed forward pass on batch: train loss: " + loss + ", train accuracy: " + accuracy)

  # Compute backward pass (in reverse layer order)
  dprobs = cross_entropy_loss::backward(probs, labels)
  dout3 = softmax::backward(dprobs, out3)
  [dout2relu, dW3, db3] = affine::backward(dout3, out2relu, W3, b3)
  dout2 = relu::backward(dout2relu, out2)
  [dout1relu, dW2, db2] = affine::backward(dout2, out1relu, W2, b2)
  dout1 = relu::backward(dout1relu, out1)
  [dfeatures, dW1, db1] = affine::backward(dout1, features, W1, b1)

  gradients = list(dW1, dW2, dW3, db1, db2, db3)
}

# Aggregation function in the form expected by paramserv: it must use the
# argument names 'model', 'gradients', and 'hyperparams', and must always
# return the updated model as a list.
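# (sgd::update applies the vanilla SGD rule: param = param - learning_rate * gradient)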
aggregation = function(list[unknown] model,
    list[unknown] hyperparams,
    list[unknown] gradients)
  return (list[unknown] model_result) {

  W1 = as.matrix(model[1])
  W2 = as.matrix(model[2])
  W3 = as.matrix(model[3])
  b1 = as.matrix(model[4])
  b2 = as.matrix(model[5])
  b3 = as.matrix(model[6])
  dW1 = as.matrix(gradients[1])
  dW2 = as.matrix(gradients[2])
  dW3 = as.matrix(gradients[3])
  db1 = as.matrix(gradients[4])
  db2 = as.matrix(gradients[5])
  db3 = as.matrix(gradients[6])
  learning_rate = as.double(as.scalar(hyperparams["learning_rate"]))

  # Optimize with SGD
  W3 = sgd::update(W3, dW3, learning_rate)
  b3 = sgd::update(b3, db3, learning_rate)
  W2 = sgd::update(W2, dW2, learning_rate)
  b2 = sgd::update(b2, db2, learning_rate)
  W1 = sgd::update(W1, dW1, learning_rate)
  b1 = sgd::update(b1, db1, learning_rate)

  model_result = list(W1, W2, W3, b1, b2, b3)
}