| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # |
# This builtin function trains a simple feed-forward neural network. The architecture of the
# network is: affine1 -> relu -> dropout -> affine2 -> configurable output activation function.
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
# NAME              TYPE            DEFAULT  MEANING
# --------------------------------------------------------------------------------------------
# X                 Matrix[Double]  ---      Training data
# Y                 Matrix[Double]  ---      Labels/target values
# batch_size        Integer         64       Batch size
# epochs            Integer         20       Number of epochs
# learning_rate     Double          0.003    Learning rate
# out_activation    String          ---      User-specified output activation function. Possible values:
#                                            "sigmoid", "relu", "lrelu", "tanh", "softmax",
#                                            "logits" (no activation).
# loss_fcn          String          ---      User-specified loss function. Possible values:
#                                            "l1", "l2", "log_loss", "logcosh_loss",
#                                            "cel" (cross-entropy loss).
# shuffle           Boolean         FALSE    Flag indicating whether the dataset should be shuffled
# validation_split  Double          0.0      Fraction of the training set used as validation set
# seed              Integer         -1       Seed for model initialization
# verbose           Boolean         FALSE    Flag indicating whether the function should print to stdout
# --------------------------------------------------------------------------------------------
# OUTPUT:
# --------------------------------------------------------------------------------------------
# NAME              TYPE            DEFAULT  MEANING
# --------------------------------------------------------------------------------------------
# model             List[unknown]   ---      Trained model which can be used in ffPredict
# --------------------------------------------------------------------------------------------
| # |
| |
| source("nn/layers/affine.dml") as affine |
| source("nn/layers/dropout.dml") as dropout |
| source("nn/layers/feedForward.dml") as ff_pass |
| |
# Loss functions supported by the model
| source("nn/layers/l1_loss.dml") as l1_loss |
| source("nn/layers/l2_loss.dml") as l2_loss |
| source("nn/layers/log_loss.dml") as log_loss |
| source("nn/layers/logcosh_loss.dml") as logcosh_loss |
| source("nn/layers/cross_entropy_loss.dml") as cel |
| |
# Activation functions supported by the model
| source("nn/layers/sigmoid.dml") as sigmoid |
| source("nn/layers/relu.dml") as relu |
| source("nn/layers/leaky_relu.dml") as lrelu |
| source("nn/layers/tanh.dml") as tanh |
| source("nn/layers/softmax.dml") as softmax |
| |
| source("nn/optim/sgd_nesterov.dml") as sgd_nesterov |
| |
| m_ffTrain = function(Matrix[double] X, Matrix[double] Y, Integer batch_size=64, |
| Integer epochs=20, Double learning_rate=0.003, String out_activation, |
| String loss_fcn, Boolean shuffle=FALSE, Double validation_split = 0.0, |
| Integer seed=-1, Boolean verbose=FALSE) |
| return (List[unknown] model) |
| { |
| |
| N = nrow(X) # number of samples |
| D = ncol(X) # number of features |
| t = ncol(Y) # number of targets |
| |
| if(shuffle) { |
| [X, Y] = shuffle(X, Y) |
| } |
| |
| validation = FALSE |
| if(validation_split > 0.0) { |
| validation = TRUE |
| [X_train, Y_train, X_val, Y_val] = val_split(X, Y, validation_split) |
| N = nrow(X_train) |
| } else { |
| X_train = X |
| Y_train = Y |
| } |
| |
  H1 = 128 # number of neurons in the hidden layer
| |
| # Init layers |
| [W1, b1] = affine::init(D, H1, seed) |
| [W2, b2] = affine::init(H1, t, seed) |
| |
  # Initialize SGD with Nesterov momentum; the learning rate decays by
  # 'decay' after each epoch, and the momentum 'mu' is annealed toward 0.999.
  lr = learning_rate
  mu = 0
  decay = 0.99
| vW1 = sgd_nesterov::init(W1) |
| vb1 = sgd_nesterov::init(b1) |
| vW2 = sgd_nesterov::init(W2) |
| vb2 = sgd_nesterov::init(b2) |
| |
  iters = ceil(N / batch_size) # mini-batches per epoch; the last batch may be smaller
| |
| for (e in 1:epochs) { |
| loss = 0 |
| val_loss = 0 |
| for(i in 1:iters) { |
| |
      begin = (i-1) * batch_size + 1
      end = min(N, begin + batch_size - 1)
| X_batch = X_train[begin:end,] |
| Y_batch = Y_train[begin:end,] |
| |
      # The output activation function is stored in the list together
      # with the layers, since different activation functions (or none)
      # might be specified. When the forward/backward pass is performed,
      # the member "activation" of the model list is used so that the
      # appropriate functions are applied. This is convenient for the
      # user, who does not have to pass the model and the activation
      # function as two separate arguments to the predict method.
| layers = list(W1=W1, b1=b1, W2=W2, b2=b2, activation=out_activation) |
| cache = ff_pass::feedForward(X=X_batch, layers=layers) |
| |
      # Distinguish two cases: the loss is calculated either from the output
      # of the activation function ("outs2") or, for "logits", from the raw
      # affine output ("out2").
| if (out_activation != "logits") { |
| loss = loss + loss_forward(as.matrix(cache["outs2"]), Y_batch, loss_fcn) |
| dout2 = loss_backward(as.matrix(cache["outs2"]), Y_batch, loss_fcn) |
| } else { |
| loss = loss + loss_forward(as.matrix(cache["out2"]), Y_batch, loss_fcn) |
| dout2 = loss_backward(as.matrix(cache["out2"]), Y_batch, loss_fcn) |
| } |
| |
| [dW1, db1, dW2, db2] = feed_backward(X_batch, layers, cache, dout2) |
| |
| [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2) |
| [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2) |
| [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1) |
| [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1) |
| |
| if(validation) { |
| cache = ff_pass::feedForward(X=X_val, layers=layers) |
| if (out_activation != "logits") |
| val_loss = val_loss + loss_forward(as.matrix(cache["outs2"]), Y_val, loss_fcn) |
| else |
| val_loss = val_loss + loss_forward(as.matrix(cache["out2"]), Y_val, loss_fcn) |
| } |
| } |
| |
    # Anneal the momentum toward 0.999 and decay the learning rate after each epoch
    mu = mu + (0.999 - mu) / (1 + epochs - e)
    lr = lr * decay
| |
    if(validation & verbose) {
      print("Epoch: " + e + ", Train loss: " + loss/iters + ", Validation loss: " + val_loss/iters)
    } else if(verbose) {
      print("Epoch: " + e + ", Train loss: " + loss/iters)
    }
| } |
| model = list(W1=W1, b1=b1, W2=W2, b2=b2, activation=out_activation) |
| } |
| |
| |
| feed_backward = function(Matrix[double] X, List[unknown] layers, List[unknown] cache, Matrix[double] dout) |
| return(Matrix[double] dW1, Matrix[double] db1, Matrix[double] dW2, Matrix[double] db2) |
| { |
  p = 0.35 # probability of keeping a neuron output; must match the value used in the forward pass
| |
| if (as.scalar(layers["activation"]) != "logits") |
| dout = apply_activation_backward(dout, as.matrix(cache["out2"]), as.scalar(layers["activation"])) |
| # Layer 2 |
| [doutd1, dW2, db2] = affine::backward(dout, as.matrix(cache["outd1"]), as.matrix(layers["W2"]), as.matrix(layers["b2"])) |
| # Layer 1 |
| doutr1 = dropout::backward(doutd1, as.matrix(cache["outr1"]), p, as.matrix(cache["maskd1"])) |
| dout1 = relu::backward(doutr1, as.matrix(cache["out1"])) |
| [dx, dW1, db1] = affine::backward(dout1, X, as.matrix(layers["W1"]), as.matrix(layers["b1"])) |
| } |
| |
| apply_activation_backward = function(Matrix[double] dout, Matrix[double] X, String activation) |
| return (Matrix[double] out) |
| { |
| if(activation == "sigmoid") { |
| out = sigmoid::backward(dout, X) |
| } else if (activation == "relu") { |
| out = relu::backward(dout, X) |
| } else if (activation == "lrelu") { |
| out = lrelu::backward(dout, X) |
| } else if (activation == "tanh") { |
| out = tanh::backward(dout, X) |
  } else if (activation == "softmax") {
    out = softmax::backward(dout, X)
  } else {
    stop("ffTrain: unsupported output activation function: " + activation)
  }
| } |
| |
| loss_forward = function(Matrix[double] prediction, Matrix[double] target, String loss_fcn) |
| return(Double loss) |
| { |
| if (loss_fcn == "l1") { |
| loss = l1_loss::forward(prediction, target) |
| } else if(loss_fcn == "l2") { |
| loss = l2_loss::forward(prediction, target) |
| } else if(loss_fcn == "log_loss") { |
| loss = log_loss::forward(prediction, target) |
| } else if(loss_fcn == "logcosh_loss") { |
| loss = logcosh_loss::forward(prediction, target) |
  } else { # "cel" or any other value defaults to cross-entropy loss
| loss = cel::forward(prediction, target) |
| } |
| } |
| |
| loss_backward = function(Matrix[double] prediction, Matrix[double] target, String loss_fcn) |
| return(Matrix[Double] dout) |
| { |
| if (loss_fcn == "l1") { |
| dout = l1_loss::backward(prediction, target) |
| } else if(loss_fcn == "l2") { |
| dout = l2_loss::backward(prediction, target) |
| } else if(loss_fcn == "log_loss") { |
| dout = log_loss::backward(prediction, target) |
| } else if(loss_fcn == "logcosh_loss") { |
| dout = logcosh_loss::backward(prediction, target) |
  } else { # "cel" or any other value defaults to cross-entropy loss
| dout = cel::backward(prediction, target) |
| } |
| } |
| |
| shuffle = function(Matrix[double] X, Matrix[double] Y) |
| return(Matrix[Double] X_new, Matrix[Double] Y_new) |
| { |
| X_col = ncol(X) |
| Y_col = ncol(Y) |
| ord = rand(rows=nrow(X), cols=1, min=0, max=1, pdf="uniform") |
| shuffled = order(target = cbind(X, Y, ord), by = X_col + Y_col + 1) |
| |
| X_new = shuffled[,1:X_col] |
| Y_new = shuffled[,X_col + 1 : X_col + Y_col] |
| } |
| |
| val_split = function(Matrix[double] X, Matrix[double] Y, Double split) |
| return(Matrix[double] X_train, Matrix[double] Y_train, |
| Matrix[double] X_val, Matrix[double] Y_val) |
| { |
| N = nrow(X) |
  val_size = round(N * split) # number of validation rows
  X_train = X[1:N - val_size,]
  Y_train = Y[1:N - val_size,]
  X_val = X[N - val_size + 1:N,]
  Y_val = Y[N - val_size + 1:N,]
| } |