#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* Breast Cancer Softmax Model
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/softmax.dml") as softmax
#source("nn/optim/adam.dml") as adam
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
train = function(matrix[double] X, matrix[double] Y,
                 matrix[double] X_val, matrix[double] Y_val,
                 double lr, double mu, double decay,
                 int batch_size, int epochs, int log_interval)
    return (matrix[double] W, matrix[double] b) {
  /*
   * Trains a softmax classifier.
   *
   * The input matrix, X, has N examples, each with D features.
   * The targets, Y, have K classes, and are one-hot encoded.
   *
   * Inputs:
   *  - X: Input data matrix, of shape (N, D).
   *  - Y: Target matrix, of shape (N, K).
   *  - X_val: Input validation data matrix, of shape (N, D).
   *  - Y_val: Target validation matrix, of shape (N, K).
   *  - lr: Learning rate.
   *  - mu: Momentum value.
   *      Typical values are in the range of [0.5, 0.99], usually
   *      started at the lower end and annealed towards the higher end.
   *  - decay: Learning rate decay rate.
   *  - batch_size: Size of mini-batches to train on.
   *  - epochs: Total number of full passes over the data set.
   *  - log_interval: Interval, in iterations, between log outputs.
   *
   * Outputs:
   *  - W: Weights (parameters) matrix, of shape (D, K).
   *  - b: Biases vector, of shape (1, K).
   */
  N = nrow(Y)  # num examples
  D = ncol(X)  # num features
  K = ncol(Y)  # num classes

  # Create softmax classifier:
  ## affine -> softmax
  [W, b] = affine::init(D, K)
  # Rescale W from the sqrt(2/D) scale used by `affine::init` to a
  # sqrt(1/D) scale, a common choice when the affine layer feeds
  # directly into a softmax.
  W = W / sqrt(2.0/(D)) * sqrt(1/(D))

  # Initialize SGD w/ Nesterov momentum optimizer
  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
  #[mW, vW] = adam::init(W)  # optimizer 1st & 2nd moment state for W
  #[mb, vb] = adam::init(b)  # optimizer 1st & 2nd moment state for b

  # Starting validation loss & accuracy
  probs_val = predict(X_val, W, b)
  loss_val = cross_entropy_loss::forward(probs_val, Y_val)
  accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
  # Output results
  print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)

  # Optimize
  print("Starting optimization")
  iters = ceil(N / batch_size)
  for (e in 1:epochs) {
    for (i in 1:iters) {
      # Get the next mini-batch
      beg = ((i-1) * batch_size) %% N + 1
      end = min(N, beg + batch_size - 1)
      #print("Epoch: " + e + ", Iter: " + i + ", X[" + beg + ":" + end + ",]")
      X_batch = X[beg:end,]
      Y_batch = Y[beg:end,]

      # Compute forward pass
      ## affine & softmax:
      out = affine::forward(X_batch, W, b)
      probs = softmax::forward(out)

      # Compute backward pass
      ## loss:
      dprobs = cross_entropy_loss::backward(probs, Y_batch)
      ## affine & softmax:
      dout = softmax::backward(dprobs, out)
      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)

      # Optimize with SGD w/ Nesterov momentum
      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
      #[W, mW, vW] = adam::update(W, dW, lr, 0.9, 0.999, 1e-8, e*i-1, mW, vW)
      #[b, mb, vb] = adam::update(b, db, lr, 0.9, 0.999, 1e-8, e*i-1, mb, vb)

      # Compute loss & accuracy for training & validation data every
      # `log_interval` iterations.
      if (i %% log_interval == 0) {
        #print("Eval time! - i: " + i)
        # Compute training loss & accuracy
        loss = cross_entropy_loss::forward(probs, Y_batch)
        accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_batch))
        # Compute validation loss & accuracy
        probs_val = predict(X_val, W, b)
        loss_val = cross_entropy_loss::forward(probs_val, Y_val)
        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
        # Output results
        print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
              + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy
              + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val
              + ", lr: " + lr + ", mu: " + mu)
      }
    }
    # Anneal momentum towards 0.999
    mu = mu + (0.999 - mu)/(1+epochs-e)
    # Decay learning rate
    lr = lr * decay
  }
}
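
/*
 * Example usage (a minimal sketch; assumes this file is saved as
 * `softmax_clf.dml` next to the `nn/` library, and that `X`, `Y`,
 * `X_val`, `Y_val` are already loaded and one-hot encoded):
 *
 *   source("softmax_clf.dml") as clf
 *   [W, b] = clf::train(X, Y, X_val, Y_val,
 *                       0.01, 0.9, 0.95,  # lr, mu, decay (illustrative values)
 *                       32, 10, 10)       # batch_size, epochs, log_interval
 */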
predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
    return (matrix[double] probs) {
  /*
   * Computes the class probability predictions of a softmax classifier.
   *
   * The input matrix, X, has N examples, each with D features.
   *
   * Inputs:
   *  - X: Input data matrix, of shape (N, D).
   *  - W: Weights (parameters) matrix, of shape (D, K).
   *  - b: Biases vector, of shape (1, K).
   *
   * Outputs:
   *  - probs: Class probabilities, of shape (N, K).
   */
  N = nrow(X)  # num examples
  K = ncol(W)  # num classes

  # Compute forward pass
  ## affine & softmax:
  out = affine::forward(X, W, b)
  probs = softmax::forward(out)
}
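
/*
 * Example (sketch): hard class predictions can be recovered from the
 * probabilities with `rowIndexMax`:
 *
 *   probs = predict(X, W, b)   # (N, K) class probabilities
 *   pred = rowIndexMax(probs)  # (N, 1) predicted class indices in 1..K
 */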
eval = function(matrix[double] probs, matrix[double] Y)
    return (double loss, double accuracy) {
  /*
   * Evaluates a softmax classifier.
   *
   * The probs matrix contains the class probability predictions
   * of K classes over N examples. The targets, Y, have K classes,
   * and are one-hot encoded.
   *
   * Inputs:
   *  - probs: Class probabilities, of shape (N, K).
   *  - Y: Target matrix, of shape (N, K).
   *
   * Outputs:
   *  - loss: Scalar loss, of shape (1).
   *  - accuracy: Scalar accuracy, of shape (1).
   */
  # Compute loss & accuracy
  loss = cross_entropy_loss::forward(probs, Y)
  correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
  accuracy = mean(correct_pred)
}
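
/*
 * Example usage (sketch): evaluating a trained model on a held-out split.
 * `X_test` and `Y_test` are placeholders with the same shapes as the
 * training data:
 *
 *   probs_test = predict(X_test, W, b)
 *   [loss_test, acc_test] = eval(probs_test, Y_test)
 *   print("Test Loss: " + loss_test + ", Test Accuracy: " + acc_test)
 */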
generate_dummy_data = function()
    return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
  /*
   * Generate a dummy dataset similar to the breast cancer dataset.
   *
   * Outputs:
   *  - X: Input data matrix, of shape (N, D).
   *  - Y: Target matrix, of shape (N, K).
   *  - C: Number of input channels (dimensionality of input depth).
   *  - Hin: Input height.
   *  - Win: Input width.
   */
  # Generate dummy input data
  N = 1024  # num examples
  C = 3  # num input channels
  Hin = 256  # input height
  Win = 256  # input width
  T = 10  # num target classes
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
  # One-hot encode, forcing the output to exactly T columns even if some
  # class does not appear in the sample.
  Y = table(seq(1, N), classes, N, T)
}
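
/*
 * End-to-end smoke test (a sketch under the same file-name assumption as
 * above). Since the data is random, accuracy should stay near chance
 * (~1/T = 0.1):
 *
 *   source("softmax_clf.dml") as clf
 *   [X, Y, C, Hin, Win] = clf::generate_dummy_data()
 *   [X_val, Y_val, C, Hin, Win] = clf::generate_dummy_data()
 *   [W, b] = clf::train(X, Y, X_val, Y_val, 0.01, 0.9, 0.95, 64, 1, 4)
 *   probs = clf::predict(X_val, W, b)
 *   [loss, accuracy] = clf::eval(probs, Y_val)
 *   print("Dummy Loss: " + loss + ", Dummy Accuracy: " + accuracy)
 */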