#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* MNIST 2NN Leaky ReLU Example
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/leaky_relu.dml") as leaky_relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val,
matrix[double] Y_val, int epochs)
return (matrix[double] W_1, matrix[double] b_1, matrix[double] W_2,
matrix[double] b_2, matrix[double] W_3, matrix[double] b_3)
{
/*
* Trains a 2-hidden-layer leaky ReLU softmax classifier.
*
* The input matrix, X, has N examples, each with D features.
* The targets, Y, have K classes, and are one-hot encoded.
*
* Inputs:
* - X: Input data matrix, of shape (N, D).
* - Y: Target matrix, of shape (N, K).
* - X_val: Input validation data matrix, of shape (N_val, D).
* - Y_val: Target validation matrix, of shape (N_val, K).
* - epochs: Total number of full training loops over the full data set.
*
* Outputs:
* - W_1: 1st layer weights (parameters) matrix, of shape (D, 200).
* - b_1: 1st layer biases vector, of shape (1, 200).
* - W_2: 2nd layer weights (parameters) matrix, of shape (200, 200).
* - b_2: 2nd layer biases vector, of shape (1, 200).
* - W_3: 3rd layer weights (parameters) matrix, of shape (200, K).
* - b_3: 3rd layer biases vector, of shape (1, K).
*/
N = nrow(X) # num examples
D = ncol(X) # num features
K = ncol(Y) # num classes
# Create the network:
# input -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
[W_1, b_1] = affine::init(D, 200)
[W_2, b_2] = affine::init(200, 200)
[W_3, b_3] = affine::init(200, K)
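# Assumption: affine::init follows the He et al. heuristic, sampling W from
# a normal distribution scaled by sqrt(2/fan_in) and setting b to zeros,
# which is a common pairing with (leaky) ReLU activations.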
# Initialize SGD
lr = 0.2 # learning rate
mu = 0 # momentum
decay = 0.99 # learning rate decay constant
vW_1 = sgd_nesterov::init(W_1) # optimizer momentum state for W_1
vb_1 = sgd_nesterov::init(b_1) # optimizer momentum state for b_1
vW_2 = sgd_nesterov::init(W_2) # optimizer momentum state for W_2
vb_2 = sgd_nesterov::init(b_2) # optimizer momentum state for b_2
vW_3 = sgd_nesterov::init(W_3) # optimizer momentum state for W_3
vb_3 = sgd_nesterov::init(b_3) # optimizer momentum state for b_3
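# For reference, sgd_nesterov::update is assumed to implement the usual
# Nesterov momentum reformulation (a sketch, not the authoritative source):
#   v_new = mu * v - lr * dX
#   X_new = X - mu * v + (1 + mu) * v_new
# With mu = 0, this reduces to plain SGD, so training starts as vanilla SGD
# and momentum is annealed in over the epochs (see the end of the epoch loop).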
# Optimize
print("Starting optimization")
batch_size = 50
iters = 1000
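# With batch_size = 50, 1000 iterations visit 50,000 rows per epoch, which
# matches a conventional 50k/10k train/validation split of MNIST (an
# assumption about the data; for other N, the %% wrap-around in the batch
# indexing below keeps the row indices in range).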
for (e in 1:epochs) {
for (i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
y_batch = Y[beg:end,]
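# Worked example of the batch indexing, assuming N = 50000: at iteration
# i = 3, beg = ((3-1) * 50) %% 50000 + 1 = 101 and
# end = min(50000, 101 + 50 - 1) = 150, i.e. rows 101:150 of X and Y.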
# Compute forward pass
## input D -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
out_1 = affine::forward(X_batch, W_1, b_1)
out_1_leaky_relu = leaky_relu::forward(out_1)
out_2 = affine::forward(out_1_leaky_relu, W_2, b_2)
out_2_leaky_relu = leaky_relu::forward(out_2)
out_3 = affine::forward(out_2_leaky_relu, W_3, b_3)
probs = softmax::forward(out_3)
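# Shape check for the forward pass (B = rows in the batch):
#   X_batch (B, D) -> out_1 (B, 200) -> out_1_leaky_relu (B, 200)
#   -> out_2 (B, 200) -> out_2_leaky_relu (B, 200)
#   -> out_3 (B, K) -> probs (B, K), with each row of probs summing to 1.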
# Compute loss & accuracy for training & validation data
loss = cross_entropy_loss::forward(probs, y_batch)
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
probs_val = predict(X_val, W_1, b_1, W_2, b_2, W_3, b_3)
loss_val = cross_entropy_loss::forward(probs_val, Y_val)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
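# Accuracy is the fraction of rows where the argmax of the predicted
# probabilities matches the argmax of the one-hot target; rowIndexMax
# returns the 1-based column index of each row's maximum.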
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
# Compute backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
dout_3 = softmax::backward(dprobs, out_3)
[dout_2_leaky_relu, dW_3, db_3] = affine::backward(dout_3, out_2_leaky_relu, W_3, b_3)
dout_2 = leaky_relu::backward(dout_2_leaky_relu, out_2)
[dout_1_leaky_relu, dW_2, db_2] = affine::backward(dout_2, out_1_leaky_relu, W_2, b_2)
dout_1 = leaky_relu::backward(dout_1_leaky_relu, out_1)
[dX_batch, dW_1, db_1] = affine::backward(dout_1, X_batch, W_1, b_1)
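# The backward pass applies the chain rule in reverse layer order: each
# affine::backward takes the gradient w.r.t. its output plus the cached
# input and emits gradients w.r.t. input, W, and b; each leaky_relu::backward
# (assuming the standard definition) scales the upstream gradient by the
# local slope, 1 for positive pre-activations and alpha for negative ones.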
# Optimize with SGD
[W_3, vW_3] = sgd_nesterov::update(W_3, dW_3, lr, mu, vW_3)
[b_3, vb_3] = sgd_nesterov::update(b_3, db_3, lr, mu, vb_3)
[W_2, vW_2] = sgd_nesterov::update(W_2, dW_2, lr, mu, vW_2)
[b_2, vb_2] = sgd_nesterov::update(b_2, db_2, lr, mu, vb_2)
[W_1, vW_1] = sgd_nesterov::update(W_1, dW_1, lr, mu, vW_1)
[b_1, vb_1] = sgd_nesterov::update(b_1, db_1, lr, mu, vb_1)
}
# Anneal momentum towards 0.999
mu = mu + (0.999 - mu)/(1+epochs-e)
# Decay learning rate
lr = lr * decay
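# Schedule example (assuming the defaults above and epochs = 10): after the
# first epoch, mu = 0 + (0.999 - 0)/(1 + 10 - 1) = 0.0999, and after ten
# epochs lr = 0.2 * 0.99^10, roughly 0.181; mu reaches 0.999 exactly at the
# final epoch.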
}
}
predict = function(matrix[double] X,
matrix[double] W_1, matrix[double] b_1,
matrix[double] W_2, matrix[double] b_2,
matrix[double] W_3, matrix[double] b_3)
return (matrix[double] probs) {
/*
* Computes the class probability predictions of a softmax classifier.
*
* The input matrix, X, has N examples, each with D features.
*
* Inputs:
* - X: Input data matrix, of shape (N, D).
* - W_1, W_2, W_3: Weights (parameters) matrices, of shapes
*     (D, 200), (200, 200), and (200, K), respectively.
* - b_1, b_2, b_3: Biases vectors, of shapes (1, 200), (1, 200), and (1, K).
*
* Outputs:
* - probs: Class probabilities, of shape (N, K).
*/
# Compute forward pass
## input -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
out_1_leaky_relu = leaky_relu::forward(affine::forward(X, W_1, b_1))
out_2_leaky_relu = leaky_relu::forward(affine::forward(out_1_leaky_relu, W_2, b_2))
probs = softmax::forward(affine::forward(out_2_leaky_relu, W_3, b_3))
}
eval = function(matrix[double] probs, matrix[double] Y)
return (double loss, double accuracy) {
/*
* Evaluates a classifier.
*
* The probs matrix contains the class probability predictions
* of K classes over N examples. The targets, Y, have K classes,
* and are one-hot encoded.
*
* Inputs:
* - probs: Class probabilities, of shape (N, K).
* - Y: Target matrix, of shape (N, K).
*
* Outputs:
* - loss: Scalar loss, of shape (1).
* - accuracy: Scalar accuracy, of shape (1).
*/
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
accuracy = mean(correct_pred)
}
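# Hypothetical driver sketch (X/Y/X_val/Y_val/X_test/Y_test and the read()
# calls that produce them are reader-supplied, not part of this file):
#   [W_1, b_1, W_2, b_2, W_3, b_3] = train(X, Y, X_val, Y_val, 10)
#   probs = predict(X_test, W_1, b_1, W_2, b_2, W_3, b_3)
#   [test_loss, test_accuracy] = eval(probs, Y_test)
#   print("Test Loss: " + test_loss + ", Test Accuracy: " + test_accuracy)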