projects/breast_cancer/softmax_clf.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 /*
  * Breast Cancer Softmax Model
  */
 # Imports
 source("nn/layers/affine.dml") as affine
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/softmax.dml") as softmax
 #source("nn/optim/adam.dml") as adam
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

 train = function(matrix[double] X, matrix[double] Y,
                  matrix[double] X_val, matrix[double] Y_val,
                  double lr, double mu, double decay,
                  int batch_size, int epochs, int log_interval)
     return (matrix[double] W, matrix[double] b) {
   /*
    * Fits a softmax classifier (affine layer followed by softmax) using
    * mini-batch SGD with Nesterov momentum.
    *
    * X holds N training examples with D features each; Y holds the
    * matching one-hot targets over K classes.
    *
    * Inputs:
    *  - X: Training inputs, of shape (N, D).
    *  - Y: One-hot training targets, of shape (N, K).
    *  - X_val: Validation inputs, with the same D features per row as X.
    *  - Y_val: One-hot validation targets, one row per row of X_val.
    *  - lr: Initial learning rate; multiplied by `decay` after each epoch.
    *  - mu: Initial momentum value.  It is moved towards 0.999 after
    *      each epoch and reaches 0.999 after the final epoch.
    *  - decay: Per-epoch multiplicative learning rate decay factor.
    *  - batch_size: Number of rows per mini-batch (the last batch of an
    *      epoch may be smaller).
    *  - epochs: Number of full passes over the training data.
    *  - log_interval: Metrics are printed every `log_interval`
    *      iterations within an epoch.
    *
    * Outputs:
    *  - W: Weights (parameters) matrix, of shape (D, K).
    *  - b: Biases vector, of shape (1, K).
    */
   n_rows = nrow(Y)  # N: number of training examples
   n_feat = ncol(X)  # D: number of features
   n_cls = ncol(Y)   # K: number of classes

   # Build the model (affine -> softmax).  The weights returned by the
   # affine initializer are divided by sqrt(2/D) and multiplied by sqrt(1/D).
   [W, b] = affine::init(n_feat, n_cls)
   scale_removed = sqrt(2.0/(n_feat))
   scale_applied = sqrt(1/(n_feat))
   W = W / scale_removed * scale_applied

   # Momentum state for the SGD w/ Nesterov momentum optimizer
   vel_W = sgd_nesterov::init(W)
   vel_b = sgd_nesterov::init(b)
   # Disabled Adam alternative:
   #[mW, vW] = adam::init(W)  # optimizer 1st & 2nd moment state for W
   #[mb, vb] = adam::init(b)  # optimizer 1st & 2nd moment state for b

   # Baseline validation metrics, before any parameter update
   val_probs = predict(X_val, W, b)
   val_loss = cross_entropy_loss::forward(val_probs, Y_val)
   val_hits = rowIndexMax(val_probs) == rowIndexMax(Y_val)
   val_acc = mean(val_hits)
   print("Start: Val Loss: " + val_loss + ", Val Accuracy: " + val_acc)

   # Training loop
   print("Starting optimization")
   batches_per_epoch = ceil(n_rows / batch_size)
   for (epoch in 1:epochs) {
     for (batch_no in 1:batches_per_epoch) {
       # Row range of the current mini-batch, clipped at the last row
       first_row = ((batch_no-1) * batch_size) %% n_rows + 1
       last_row = min(n_rows, first_row + batch_size - 1)
       X_mb = X[first_row:last_row,]
       Y_mb = Y[first_row:last_row,]

       # Forward pass: affine -> softmax
       scores = affine::forward(X_mb, W, b)
       probs = softmax::forward(scores)

       # Backward pass: loss -> softmax -> affine
       grad_probs = cross_entropy_loss::backward(probs, Y_mb)
       grad_scores = softmax::backward(grad_probs, scores)
       [grad_X, grad_W, grad_b] = affine::backward(grad_scores, X_mb, W, b)

       # Parameter update with SGD w/ Nesterov momentum
       [W, vel_W] = sgd_nesterov::update(W, grad_W, lr, mu, vel_W)
       [b, vel_b] = sgd_nesterov::update(b, grad_b, lr, mu, vel_b)
       # Disabled Adam alternative:
       #[W, mW, vW] = adam::update(W, grad_W, lr, 0.9, 0.999, 1e-8, epoch*batch_no-1, mW, vW)
       #[b, mb, vb] = adam::update(b, grad_b, lr, 0.9, 0.999, 1e-8, epoch*batch_no-1, mb, vb)

       # Periodic logging of training & validation metrics
       if (batch_no %% log_interval == 0) {
         # Training metrics reuse this batch's predictions, which were
         # computed before the parameter update above.
         train_loss = cross_entropy_loss::forward(probs, Y_mb)
         train_hits = rowIndexMax(probs) == rowIndexMax(Y_mb)
         train_acc = mean(train_hits)

         # Validation metrics use the freshly updated parameters.
         val_probs = predict(X_val, W, b)
         val_loss = cross_entropy_loss::forward(val_probs, Y_val)
         val_hits = rowIndexMax(val_probs) == rowIndexMax(Y_val)
         val_acc = mean(val_hits)

         # Assemble the log line piece by piece, then emit it
         progress = "Epoch: " + epoch + "/" + epochs + ", Iter: " + batch_no + "/" + batches_per_epoch
         train_part = ", Train Loss: " + train_loss + ", Train Accuracy: " + train_acc
         val_part = ", Val Loss: " + val_loss + ", Val Accuracy: " + val_acc
         hyper_part = ", lr: " + lr + ", mu " + mu
         print(progress + train_part + val_part + hyper_part)
       }
     }
     # End-of-epoch schedule: decay the learning rate and anneal the
     # momentum towards 0.999 (the two updates are independent).
     lr = lr * decay
     mu = mu + (0.999 - mu)/(1+epochs-epoch)
   }
 }

 predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
     return (matrix[double] probs) {
   /*
    * Computes the class probability predictions of a softmax classifier.
    *
    * The input matrix, X, has N examples, each with D features.
    *
    * Inputs:
    *  - X: Input data matrix, of shape (N, D).
    *  - W: Weights (parameters) matrix, of shape (D, K).
    *  - b: Biases vector, of shape (1, K).
    *
    * Outputs:
    *  - probs: Class probabilities, of shape (N, K).
    */
   # Forward pass: affine layer, then softmax over its output.
   scores = affine::forward(X, W, b)
   probs = softmax::forward(scores)
 }

 eval = function(matrix[double] probs, matrix[double] Y)
     return (double loss, double accuracy) {
   /*
    * Scores softmax predictions against one-hot targets.
    *
    * `probs` holds predicted class probabilities for N examples over
    * K classes, and `Y` holds the matching one-hot encoded targets.
    *
    * Inputs:
    *  - probs: Class probabilities, of shape (N, K).
    *  - Y: Target matrix, of shape (N, K).
    *
    * Outputs:
    *  - loss: Scalar cross-entropy loss.
    *  - accuracy: Scalar fraction of rows whose highest-probability
    *      column matches the largest column of the target row.
    */
   # Accuracy: compare per-row argmax of predictions and targets
   predicted_class = rowIndexMax(probs)
   true_class = rowIndexMax(Y)
   accuracy = mean(predicted_class == true_class)

   # Loss: delegated to the cross-entropy layer
   loss = cross_entropy_loss::forward(probs, Y)
 }

 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
    * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D), with D = C*Hin*Win,
    *      filled with random values drawn with pdf="normal".
    *  - Y: One-hot target matrix, of shape (N, T), with T = 10 classes.
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    */
   # Fixed dataset dimensions
   N = 1024  # num examples
   C = 3  # num input channels
   Hin = 256  # input height
   Win = 256  # input width
   T = 10  # num targets

   # Random inputs, one flattened C x Hin x Win image per row
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")

   # Random class labels in 1..T.
   # NOTE(review): rounding a uniform draw from [1, T] makes labels 1 and T
   # about half as likely as the interior labels; kept as-is for dummy data.
   classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))

   # One-hot encoding.  The output dimensions are given explicitly so that Y
   # always has exactly T columns, even if the highest label is never drawn
   # (without them, table() sizes its result from the largest label seen).
   Y = table(seq(1, N), classes, N, T)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	/*
	* Breast Cancer Softmax Model
	*/
	# Imports
	source("nn/layers/affine.dml") as affine
	source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
	source("nn/layers/softmax.dml") as softmax
	#source("nn/optim/adam.dml") as adam
	source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

	train = function(matrix[double] X, matrix[double] Y,
	                 matrix[double] X_val, matrix[double] Y_val,
	                 double lr, double mu, double decay,
	                 int batch_size, int epochs, int log_interval)
	    return (matrix[double] W, matrix[double] b) {
	  /*
	   * Fits a softmax classifier (affine layer followed by softmax) using
	   * mini-batch SGD with Nesterov momentum.
	   *
	   * X holds N training examples with D features each; Y holds the
	   * matching one-hot targets over K classes.
	   *
	   * Inputs:
	   *  - X: Training inputs, of shape (N, D).
	   *  - Y: One-hot training targets, of shape (N, K).
	   *  - X_val: Validation inputs, with the same D features per row as X.
	   *  - Y_val: One-hot validation targets, one row per row of X_val.
	   *  - lr: Initial learning rate; multiplied by `decay` after each epoch.
	   *  - mu: Initial momentum value.  It is moved towards 0.999 after
	   *      each epoch and reaches 0.999 after the final epoch.
	   *  - decay: Per-epoch multiplicative learning rate decay factor.
	   *  - batch_size: Number of rows per mini-batch (the last batch of an
	   *      epoch may be smaller).
	   *  - epochs: Number of full passes over the training data.
	   *  - log_interval: Metrics are printed every `log_interval`
	   *      iterations within an epoch.
	   *
	   * Outputs:
	   *  - W: Weights (parameters) matrix, of shape (D, K).
	   *  - b: Biases vector, of shape (1, K).
	   */
	  n_rows = nrow(Y)  # N: number of training examples
	  n_feat = ncol(X)  # D: number of features
	  n_cls = ncol(Y)   # K: number of classes

	  # Build the model (affine -> softmax).  The weights returned by the
	  # affine initializer are divided by sqrt(2/D) and multiplied by sqrt(1/D).
	  [W, b] = affine::init(n_feat, n_cls)
	  scale_removed = sqrt(2.0/(n_feat))
	  scale_applied = sqrt(1/(n_feat))
	  W = W / scale_removed * scale_applied

	  # Momentum state for the SGD w/ Nesterov momentum optimizer
	  vel_W = sgd_nesterov::init(W)
	  vel_b = sgd_nesterov::init(b)
	  # Disabled Adam alternative:
	  #[mW, vW] = adam::init(W)  # optimizer 1st & 2nd moment state for W
	  #[mb, vb] = adam::init(b)  # optimizer 1st & 2nd moment state for b

	  # Baseline validation metrics, before any parameter update
	  val_probs = predict(X_val, W, b)
	  val_loss = cross_entropy_loss::forward(val_probs, Y_val)
	  val_hits = rowIndexMax(val_probs) == rowIndexMax(Y_val)
	  val_acc = mean(val_hits)
	  print("Start: Val Loss: " + val_loss + ", Val Accuracy: " + val_acc)

	  # Training loop
	  print("Starting optimization")
	  batches_per_epoch = ceil(n_rows / batch_size)
	  for (epoch in 1:epochs) {
	    for (batch_no in 1:batches_per_epoch) {
	      # Row range of the current mini-batch, clipped at the last row
	      first_row = ((batch_no-1) * batch_size) %% n_rows + 1
	      last_row = min(n_rows, first_row + batch_size - 1)
	      X_mb = X[first_row:last_row,]
	      Y_mb = Y[first_row:last_row,]

	      # Forward pass: affine -> softmax
	      scores = affine::forward(X_mb, W, b)
	      probs = softmax::forward(scores)

	      # Backward pass: loss -> softmax -> affine
	      grad_probs = cross_entropy_loss::backward(probs, Y_mb)
	      grad_scores = softmax::backward(grad_probs, scores)
	      [grad_X, grad_W, grad_b] = affine::backward(grad_scores, X_mb, W, b)

	      # Parameter update with SGD w/ Nesterov momentum
	      [W, vel_W] = sgd_nesterov::update(W, grad_W, lr, mu, vel_W)
	      [b, vel_b] = sgd_nesterov::update(b, grad_b, lr, mu, vel_b)
	      # Disabled Adam alternative:
	      #[W, mW, vW] = adam::update(W, grad_W, lr, 0.9, 0.999, 1e-8, epoch*batch_no-1, mW, vW)
	      #[b, mb, vb] = adam::update(b, grad_b, lr, 0.9, 0.999, 1e-8, epoch*batch_no-1, mb, vb)

	      # Periodic logging of training & validation metrics
	      if (batch_no %% log_interval == 0) {
	        # Training metrics reuse this batch's predictions, which were
	        # computed before the parameter update above.
	        train_loss = cross_entropy_loss::forward(probs, Y_mb)
	        train_hits = rowIndexMax(probs) == rowIndexMax(Y_mb)
	        train_acc = mean(train_hits)

	        # Validation metrics use the freshly updated parameters.
	        val_probs = predict(X_val, W, b)
	        val_loss = cross_entropy_loss::forward(val_probs, Y_val)
	        val_hits = rowIndexMax(val_probs) == rowIndexMax(Y_val)
	        val_acc = mean(val_hits)

	        # Assemble the log line piece by piece, then emit it
	        progress = "Epoch: " + epoch + "/" + epochs + ", Iter: " + batch_no + "/" + batches_per_epoch
	        train_part = ", Train Loss: " + train_loss + ", Train Accuracy: " + train_acc
	        val_part = ", Val Loss: " + val_loss + ", Val Accuracy: " + val_acc
	        hyper_part = ", lr: " + lr + ", mu " + mu
	        print(progress + train_part + val_part + hyper_part)
	      }
	    }
	    # End-of-epoch schedule: decay the learning rate and anneal the
	    # momentum towards 0.999 (the two updates are independent).
	    lr = lr * decay
	    mu = mu + (0.999 - mu)/(1+epochs-epoch)
	  }
	}

	predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
	    return (matrix[double] probs) {
	  /*
	   * Computes the class probability predictions of a softmax classifier.
	   *
	   * The input matrix, X, has N examples, each with D features.
	   *
	   * Inputs:
	   *  - X: Input data matrix, of shape (N, D).
	   *  - W: Weights (parameters) matrix, of shape (D, K).
	   *  - b: Biases vector, of shape (1, K).
	   *
	   * Outputs:
	   *  - probs: Class probabilities, of shape (N, K).
	   */
	  # Forward pass: affine layer, then softmax over its output.
	  scores = affine::forward(X, W, b)
	  probs = softmax::forward(scores)
	}

	eval = function(matrix[double] probs, matrix[double] Y)
	    return (double loss, double accuracy) {
	  /*
	   * Scores softmax predictions against one-hot targets.
	   *
	   * `probs` holds predicted class probabilities for N examples over
	   * K classes, and `Y` holds the matching one-hot encoded targets.
	   *
	   * Inputs:
	   *  - probs: Class probabilities, of shape (N, K).
	   *  - Y: Target matrix, of shape (N, K).
	   *
	   * Outputs:
	   *  - loss: Scalar cross-entropy loss.
	   *  - accuracy: Scalar fraction of rows whose highest-probability
	   *      column matches the largest column of the target row.
	   */
	  # Accuracy: compare per-row argmax of predictions and targets
	  predicted_class = rowIndexMax(probs)
	  true_class = rowIndexMax(Y)
	  accuracy = mean(predicted_class == true_class)

	  # Loss: delegated to the cross-entropy layer
	  loss = cross_entropy_loss::forward(probs, Y)
	}

	generate_dummy_data = function()
	    return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
	  /*
	   * Generate a dummy dataset similar to the breast cancer dataset.
	   *
	   * Outputs:
	   *  - X: Input data matrix, of shape (N, D), with D = C*Hin*Win,
	   *      filled with random values drawn with pdf="normal".
	   *  - Y: One-hot target matrix, of shape (N, T), with T = 10 classes.
	   *  - C: Number of input channels (dimensionality of input depth).
	   *  - Hin: Input height.
	   *  - Win: Input width.
	   */
	  # Fixed dataset dimensions
	  N = 1024  # num examples
	  C = 3  # num input channels
	  Hin = 256  # input height
	  Win = 256  # input width
	  T = 10  # num targets

	  # Random inputs, one flattened C x Hin x Win image per row.
	  # The column count is the product C*Hin*Win (the multiplication signs
	  # had been lost, leaving a reference to an undefined name).
	  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")

	  # Random class labels in 1..T.
	  # NOTE(review): rounding a uniform draw from [1, T] makes labels 1 and T
	  # about half as likely as the interior labels; kept as-is for dummy data.
	  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))

	  # One-hot encoding.  The output dimensions are given explicitly so that Y
	  # always has exactly T columns, even if the highest label is never drawn
	  # (without them, table() sizes its result from the largest label seen).
	  Y = table(seq(1, N), classes, N, T)
	}