#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* MNIST 2NN Leaky ReLU Example
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/leaky_relu.dml") as leaky_relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val,
matrix[double] Y_val, int epochs)
return (matrix[double] W_1, matrix[double] b_1, matrix[double] W_2,
matrix[double] b_2, matrix[double] W_3, matrix[double] b_3)
{
/*
* Trains a 2-hidden-layer leaky ReLU softmax classifier.
*
* The input matrix, X, has N examples, each with D features.
* The targets, Y, have K classes, and are one-hot encoded.
*
* Inputs:
* - X: Input data matrix, of shape (N, D).
* - Y: Target matrix, of shape (N, K).
* - X_val: Input validation data matrix, of shape (N_val, D).
* - Y_val: Target validation matrix, of shape (N_val, K).
* - epochs: Total number of full training loops over the full data set.
*
* Outputs:
* - W_1: 1st layer weights (parameters) matrix, of shape (D, 200).
* - b_1: 1st layer biases vector, of shape (1, 200).
* - W_2: 2nd layer weights (parameters) matrix, of shape (200, 200).
* - b_2: 2nd layer biases vector, of shape (1, 200).
* - W_3: 3rd layer weights (parameters) matrix, of shape (200, K).
* - b_3: 3rd layer biases vector, of shape (1, K).
*/
N = nrow(X) # num examples
D = ncol(X) # num features
K = ncol(Y) # num classes
# Create the network:
# input -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
[W_1, b_1] = affine::init(D, 200)
[W_2, b_2] = affine::init(200, 200)
[W_3, b_3] = affine::init(200, K)
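# Assumption: affine::init follows the He et al. heuristic, sampling W from
# a normal distribution scaled by sqrt(2/fan_in) and setting b to zeros,
# which is a common pairing with (leaky) ReLU activations.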
# Initialize SGD
lr = 0.2 # learning rate
mu = 0 # momentum
decay = 0.99 # learning rate decay constant
vW_1 = sgd_nesterov::init(W_1) # optimizer momentum state for W_1
vb_1 = sgd_nesterov::init(b_1) # optimizer momentum state for b_1
vW_2 = sgd_nesterov::init(W_2) # optimizer momentum state for W_2
vb_2 = sgd_nesterov::init(b_2) # optimizer momentum state for b_2
vW_3 = sgd_nesterov::init(W_3) # optimizer momentum state for W_3
vb_3 = sgd_nesterov::init(b_3) # optimizer momentum state for b_3
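# For reference, sgd_nesterov::update is assumed to implement the usual
# Nesterov momentum reformulation (a sketch, not the authoritative source):
#   v_new = mu * v - lr * dX
#   X_new = X - mu * v + (1 + mu) * v_new
# With mu = 0, this reduces to plain SGD, so training starts as vanilla SGD
# and momentum is annealed in over the epochs (see the end of the epoch loop).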
# Optimize
print("Starting optimization")
batch_size = 50
iters = 1000
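# With batch_size = 50, 1000 iterations visit 50,000 rows per epoch, which
# matches a conventional 50k/10k train/validation split of MNIST (an
# assumption about the data; for other N, the %% wrap-around in the batch
# indexing below keeps the row indices in range).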
for (e in 1:epochs) {
for (i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
y_batch = Y[beg:end,]
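# Worked example of the batch indexing, assuming N = 50000: at iteration
# i = 3, beg = ((3-1) * 50) %% 50000 + 1 = 101 and
# end = min(50000, 101 + 50 - 1) = 150, i.e. rows 101:150 of X and Y.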
# Compute forward pass
## input D -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
out_1 = affine::forward(X_batch, W_1, b_1)
out_1_leaky_relu = leaky_relu::forward(out_1)
out_2 = affine::forward(out_1_leaky_relu, W_2, b_2)
out_2_leaky_relu = leaky_relu::forward(out_2)
out_3 = affine::forward(out_2_leaky_relu, W_3, b_3)
probs = softmax::forward(out_3)
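# Shape check for the forward pass (B = rows in the batch):
#   X_batch (B, D) -> out_1 (B, 200) -> out_1_leaky_relu (B, 200)
#   -> out_2 (B, 200) -> out_2_leaky_relu (B, 200)
#   -> out_3 (B, K) -> probs (B, K), with each row of probs summing to 1.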
# Compute loss & accuracy for training & validation data
loss = cross_entropy_loss::forward(probs, y_batch)
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
probs_val = predict(X_val, W_1, b_1, W_2, b_2, W_3, b_3)
loss_val = cross_entropy_loss::forward(probs_val, Y_val)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
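# Accuracy is the fraction of rows where the argmax of the predicted
# probabilities matches the argmax of the one-hot target; rowIndexMax
# returns the 1-based column index of each row's maximum.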
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
# Compute backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
dout_3 = softmax::backward(dprobs, out_3)
[dout_2_leaky_relu, dW_3, db_3] = affine::backward(dout_3, out_2_leaky_relu, W_3, b_3)
dout_2 = leaky_relu::backward(dout_2_leaky_relu, out_2)
[dout_1_leaky_relu, dW_2, db_2] = affine::backward(dout_2, out_1_leaky_relu, W_2, b_2)
dout_1 = leaky_relu::backward(dout_1_leaky_relu, out_1)
[dX_batch, dW_1, db_1] = affine::backward(dout_1, X_batch, W_1, b_1)
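# The backward pass applies the chain rule in reverse layer order: each
# affine::backward takes the gradient w.r.t. its output plus the cached
# input and emits gradients w.r.t. input, W, and b; each leaky_relu::backward
# (assuming the standard definition) scales the upstream gradient by the
# local slope, 1 for positive pre-activations and alpha for negative ones.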
# Optimize with SGD
[W_3, vW_3] = sgd_nesterov::update(W_3, dW_3, lr, mu, vW_3)
[b_3, vb_3] = sgd_nesterov::update(b_3, db_3, lr, mu, vb_3)
[W_2, vW_2] = sgd_nesterov::update(W_2, dW_2, lr, mu, vW_2)
[b_2, vb_2] = sgd_nesterov::update(b_2, db_2, lr, mu, vb_2)
[W_1, vW_1] = sgd_nesterov::update(W_1, dW_1, lr, mu, vW_1)
[b_1, vb_1] = sgd_nesterov::update(b_1, db_1, lr, mu, vb_1)
}
# Anneal momentum towards 0.999
mu = mu + (0.999 - mu)/(1+epochs-e)
# Decay learning rate
lr = lr * decay
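# Schedule example (assuming the defaults above and epochs = 10): after the
# first epoch, mu = 0 + (0.999 - 0)/(1 + 10 - 1) = 0.0999, and after ten
# epochs lr = 0.2 * 0.99^10, roughly 0.181; mu reaches 0.999 exactly at the
# final epoch.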
}
}
predict = function(matrix[double] X,
matrix[double] W_1, matrix[double] b_1,
matrix[double] W_2, matrix[double] b_2,
matrix[double] W_3, matrix[double] b_3)
return (matrix[double] probs) {
/*
* Computes the class probability predictions of a softmax classifier.
*
* The input matrix, X, has N examples, each with D features.
*
* Inputs:
* - X: Input data matrix, of shape (N, D).
* - W_1, W_2, W_3: Weights (parameters) matrices, of shapes
*     (D, 200), (200, 200), and (200, K), respectively.
* - b_1, b_2, b_3: Biases vectors, of shapes (1, 200), (1, 200), and (1, K).
*
* Outputs:
* - probs: Class probabilities, of shape (N, K).
*/
# Compute forward pass
## input -> 200-neuron affine -> leaky ReLU -> 200-neuron affine -> leaky ReLU -> K-neuron affine -> softmax
out_1_leaky_relu = leaky_relu::forward(affine::forward(X, W_1, b_1))
out_2_leaky_relu = leaky_relu::forward(affine::forward(out_1_leaky_relu, W_2, b_2))
probs = softmax::forward(affine::forward(out_2_leaky_relu, W_3, b_3))
}
eval = function(matrix[double] probs, matrix[double] Y)
return (double loss, double accuracy) {
/*
* Evaluates a classifier.
*
* The probs matrix contains the class probability predictions
* of K classes over N examples. The targets, Y, have K classes,
* and are one-hot encoded.
*
* Inputs:
* - probs: Class probabilities, of shape (N, K).
* - Y: Target matrix, of shape (N, K).
*
* Outputs:
* - loss: Scalar loss, of shape (1).
* - accuracy: Scalar accuracy, of shape (1).
*/
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
accuracy = mean(correct_pred)
}
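# Hypothetical driver sketch (X/Y/X_val/Y_val/X_test/Y_test and the read()
# calls that produce them are reader-supplied, not part of this file):
#   [W_1, b_1, W_2, b_2, W_3, b_3] = train(X, Y, X_val, Y_val, 10)
#   probs = predict(X_test, W_1, b_1, W_2, b_2, W_3, b_3)
#   [test_loss, test_accuracy] = eval(probs, Y_test)
#   print("Test Loss: " + test_loss + ", Test Accuracy: " + test_accuracy)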