| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| /* |
| * Breast Cancer Softmax Model |
| */ |
| # Imports |
| source("nn/layers/affine.dml") as affine |
| source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss |
| source("nn/layers/softmax.dml") as softmax |
| #source("nn/optim/adam.dml") as adam |
| source("nn/optim/sgd_nesterov.dml") as sgd_nesterov |
| |
train = function(matrix[double] X, matrix[double] Y,
                 matrix[double] X_val, matrix[double] Y_val,
                 double lr, double mu, double decay,
                 int batch_size, int epochs, int log_interval)
    return (matrix[double] W, matrix[double] b) {
  /*
   * Fits a softmax classifier (affine layer followed by softmax) using
   * mini-batch SGD with Nesterov momentum.
   *
   * X holds N training examples with D features each; Y holds the
   * matching one-hot targets over K classes.  Mini-batches are taken as
   * consecutive row ranges of X and Y (no shuffling is done here).
   *
   * Inputs:
   *  - X: Training data matrix, of shape (N, D).
   *  - Y: One-hot training target matrix, of shape (N, K).
   *  - X_val: Validation data matrix, of shape (N_val, D).
   *  - Y_val: One-hot validation target matrix, of shape (N_val, K).
   *  - lr: Initial learning rate; multiplied by `decay` after each epoch.
   *  - mu: Initial momentum; annealed after each epoch so that it
   *      equals 0.999 once the final epoch has finished.
   *  - decay: Per-epoch multiplicative learning rate decay factor.
   *  - batch_size: Number of rows per mini-batch (the last batch of an
   *      epoch may be smaller).
   *  - epochs: Number of full passes over the training data.
   *  - log_interval: Log metrics whenever the within-epoch iteration
   *      number is a multiple of this value.
   *
   * Outputs:
   *  - W: Trained weights matrix, of shape (D, K).
   *  - b: Trained biases vector, of shape (1, K).
   */
  num_examples = nrow(Y)
  num_features = ncol(X)
  num_classes = ncol(Y)

  # Model parameters: affine -> softmax.
  [W, b] = affine::init(num_features, num_classes)
  # Rescale the initial weights: divide by sqrt(2/D), then multiply by sqrt(1/D).
  # NOTE(review): presumably this swaps the scaling applied by affine::init
  # for a 1/D-based one -- confirm against nn/layers/affine.dml.
  scale_removed = sqrt(2.0/(num_features))
  scale_applied = sqrt(1/(num_features))
  W = W / scale_removed * scale_applied

  # Momentum (velocity) state of the optimizer, one per parameter.
  vW = sgd_nesterov::init(W)
  vb = sgd_nesterov::init(b)

  # Baseline validation metrics before any training step.
  val_probs = predict(X_val, W, b)
  val_loss = cross_entropy_loss::forward(val_probs, Y_val)
  val_accuracy = mean(rowIndexMax(val_probs) == rowIndexMax(Y_val))
  print("Start: Val Loss: " + val_loss + ", Val Accuracy: " + val_accuracy)

  print("Starting optimization")
  num_iters = ceil(num_examples / batch_size)
  for (epoch in 1:epochs) {
    for (iter in 1:num_iters) {
      # Row range of the current mini-batch, clipped at the last example.
      row_lo = ((iter-1) * batch_size) %% num_examples + 1
      row_hi = min(num_examples, row_lo + batch_size - 1)
      X_mb = X[row_lo:row_hi,]
      Y_mb = Y[row_lo:row_hi,]

      # Forward: affine -> softmax.
      scores = affine::forward(X_mb, W, b)
      probs = softmax::forward(scores)

      # Backward: loss -> softmax -> affine.
      dprobs = cross_entropy_loss::backward(probs, Y_mb)
      dscores = softmax::backward(dprobs, scores)
      [dX_mb, dW, db] = affine::backward(dscores, X_mb, W, b)

      # Parameter step with SGD w/ Nesterov momentum.
      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)

      if (iter %% log_interval == 0) {
        # Training metrics use the probabilities from this iteration's
        # forward pass, i.e. from before the parameter step above.
        train_loss = cross_entropy_loss::forward(probs, Y_mb)
        train_accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_mb))

        # Validation metrics use the freshly updated parameters.
        val_probs = predict(X_val, W, b)
        val_loss = cross_entropy_loss::forward(val_probs, Y_val)
        val_accuracy = mean(rowIndexMax(val_probs) == rowIndexMax(Y_val))

        progress = "Epoch: " + epoch + "/" + epochs + ", Iter: " + iter + "/" + num_iters
        metrics = ", Train Loss: " + train_loss + ", Train Accuracy: " + train_accuracy + ", Val Loss: " + val_loss + ", Val Accuracy: " + val_accuracy + ", lr: " + lr + ", mu " + mu
        print(progress + metrics)
      }
    }
    # Close a 1/(remaining epochs) share of the gap between mu and 0.999.
    mu = mu + (0.999 - mu)/(1+epochs-epoch)
    # Shrink the learning rate for the next epoch.
    lr = lr * decay
  }
}
| |
predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
    return (matrix[double] probs) {
  /*
   * Computes the class probability predictions of a softmax classifier.
   *
   * The input matrix, X, has N examples, each with D features.  The
   * examples are passed through the affine layer and then through the
   * softmax layer.
   *
   * Inputs:
   *  - X: Input data matrix, of shape (N, D).
   *  - W: Weights (parameters) matrix, of shape (D, K).
   *  - b: Biases vector, of shape (1, K).
   *
   * Outputs:
   *  - probs: Class probabilities, of shape (N, K).
   */
  # Compute forward pass
  ## affine & softmax:
  out = affine::forward(X, W, b)
  probs = softmax::forward(out)
}
| |
eval = function(matrix[double] probs, matrix[double] Y)
    return (double loss, double accuracy) {
  /*
   * Scores the predictions of a softmax classifier against the targets.
   *
   * `probs` holds predicted class probabilities for N examples over K
   * classes, and `Y` holds the matching one-hot encoded targets.
   *
   * Inputs:
   *  - probs: Class probabilities, of shape (N, K).
   *  - Y: Target matrix, of shape (N, K).
   *
   * Outputs:
   *  - loss: Scalar cross-entropy loss.
   *  - accuracy: Scalar fraction of examples whose highest-probability
   *      class matches the target class.
   */
  # Per-example predicted and true class indices.
  predicted_class = rowIndexMax(probs)
  true_class = rowIndexMax(Y)

  # Fraction of rows where the two indices agree.
  accuracy = mean(predicted_class == true_class)

  # Cross-entropy between the predictions and the one-hot targets.
  loss = cross_entropy_loss::forward(probs, Y)
}
| |
generate_dummy_data = function()
    return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
  /*
   * Generates a random dummy dataset of flattened image-like examples
   * with one-hot encoded class targets.
   *
   * Outputs:
   *  - X: Input data matrix, of shape (N, C*Hin*Win), drawn from a
   *      normal distribution.
   *  - Y: One-hot target matrix, of shape (N, T).
   *  - C: Number of input channels (dimensionality of input depth).
   *  - Hin: Input height.
   *  - Win: Input width.
   *
   * Here N = 1024 examples and T = 10 target classes.
   */
  # Generate dummy input data
  N = 1024  # num examples
  C = 3  # num input channels
  Hin = 256  # input height
  Win = 256  # input width
  T = 10  # num targets
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")

  # Class labels in 1..T.  Rounding a uniform draw on [1, T] gives the two
  # end classes about half the weight of the others, which is acceptable
  # for dummy data.
  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))

  # One-hot encoding.  The output dimensions are given explicitly so that
  # Y always has exactly T columns; without them, table() sizes the result
  # by the largest label that happened to be sampled.
  Y = table(seq(1, N), classes, N, T)
}
| |