| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| /* |
| * This file implements all needed functions to evaluate a convolutional neural network of the "LeNet" architecture |
| * on different execution schemes and with different inputs, for example a federated input matrix. |
| */ |
| |
| # Imports |
| source("scripts/nn/layers/affine.dml") as affine |
| source("scripts/nn/layers/conv2d_builtin.dml") as conv2d |
| source("scripts/nn/layers/cross_entropy_loss.dml") as cross_entropy_loss |
| source("scripts/nn/layers/dropout.dml") as dropout |
| source("scripts/nn/layers/l2_reg.dml") as l2_reg |
| source("scripts/nn/layers/max_pool2d_builtin.dml") as max_pool2d |
| source("scripts/nn/layers/relu.dml") as relu |
| source("scripts/nn/layers/softmax.dml") as softmax |
| source("scripts/nn/optim/sgd_nesterov.dml") as sgd_nesterov |
| |
| /* |
|  * Trains a convolutional net using the "LeNet" architecture with a plain, |
|  * single-threaded mini-batch loop (no parameter server). |
|  * |
|  * The input matrix, X, has N examples, each represented as a 3D |
|  * volume unrolled into a single vector. The targets, y, have K |
|  * classes, and are one-hot encoded. |
|  * |
|  * Each iteration delegates to the `gradients` and `aggregation` functions |
|  * of this file, i.e. the same update/aggregate pair that is handed to |
|  * `paramserv` in `train_paramserv`. |
|  * |
|  * Inputs: |
|  *  - X: Input data matrix, of shape (N, C*Hin*Win) |
|  *  - y: Target matrix, of shape (N, K) |
|  *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win) (not read by this function) |
|  *  - y_val: Target validation matrix, of shape (N, K) (not read by this function) |
|  *  - C: Number of input channels (dimensionality of input depth) |
|  *  - Hin: Input height |
|  *  - Win: Input width |
|  *  - epochs: Total number of full training loops over the full data set |
|  *  - batch_size: Batch size |
|  *  - learning_rate: The learning rate for the SGD |
|  * |
|  * Outputs: |
|  *  - model_trained: List of 16 matrices, in this order: |
|  *     1-4   W1, W2, W3, W4: layer weights, of shapes (F1, C*Hf*Wf), (F2, F1*Hf*Wf), |
|  *           (F2*(Hin/4)*(Win/4), N3) and (N3, K) |
|  *     5-8   b1, b2, b3, b4: layer biases, of shapes (F1, 1), (F2, 1), (1, N3) and (1, K) |
|  *     9-12  vW1, vW2, vW3, vW4: Nesterov momentum state of the weights |
|  *     13-16 vb1, vb2, vb3, vb4: Nesterov momentum state of the biases |
|  */ |
| train = function(matrix[double] X, matrix[double] y, |
|                  matrix[double] X_val, matrix[double] y_val, |
|                  int C, int Hin, int Win, int epochs, int batch_size, double learning_rate) |
|     return (list[unknown] model_trained) { |
| |
|   N = nrow(X) |
|   K = ncol(y) |
| |
|   # Create network: |
|   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax |
|   Hf = 5 # filter height |
|   Wf = 5 # filter width |
|   stride = 1 |
|   pad = 2 # For same dimensions, (Hf - stride) / 2 |
|   F1 = 32 # num conv filters in conv1 |
|   F2 = 64 # num conv filters in conv2 |
|   N3 = 512 # num nodes in affine3 |
|   # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes) |
| |
|   [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win) |
|   [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2)) |
|   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2)) |
|   [W4, b4] = affine::init(N3, K) # inputs: (N, N3) |
|   W4 = W4 / sqrt(2) # different initialization, since being fed into softmax, instead of relu |
| |
|   # Initialize SGD w/ Nesterov momentum optimizer |
|   mu = 0.9 # momentum |
|   decay = 0.95 # learning rate decay constant (stored in hyperparams; not read by gradients/aggregation in this file) |
|   vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) |
|   vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) |
|   vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) |
|   vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4) |
|   # Regularization |
|   lambda = 5e-04 |
| |
|   # Create the hyper parameter list |
|   hyperparams = list(learning_rate=learning_rate, mu=mu, decay=decay, C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, lambda=lambda, F1=F1, F2=F2, N3=N3) |
|   # Calculate iterations |
|   iters = ceil(N / batch_size) |
|   # Aim for about 25 progress marks per epoch. Clamp to 1: with fewer than 25 |
|   # iterations the interval would be 0 and `i %% 0` never equals 0, so no |
|   # progress mark would ever be printed. |
|   print_interval = max(1, floor(iters / 25)) |
| |
|   print("[+] Starting optimization") |
|   print("[+] Learning rate: " + learning_rate) |
|   print("[+] Batch size: " + batch_size) |
|   print("[+] Iterations per epoch: " + iters + "\n") |
| |
|   for (e in 1:epochs) { |
|     print("[+] Starting epoch: " + e) |
|     print("|") |
|     for(i in 1:iters) { |
|       # Create the model list (same layout as the returned model_trained) |
|       model_list = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4) |
| |
|       # Get next batch; the last batch of an epoch may be smaller than batch_size |
|       beg = ((i-1) * batch_size) %% N + 1 |
|       end = min(N, beg + batch_size - 1) |
|       X_batch = X[beg:end,] |
|       y_batch = y[beg:end,] |
| |
|       # One optimization step: compute gradients, then apply the optimizer update |
|       gradients_list = gradients(model_list, hyperparams, X_batch, y_batch) |
|       model_updated = aggregation(model_list, hyperparams, gradients_list) |
| |
|       # Unpack the updated parameters and optimizer state for the next iteration |
|       W1 = as.matrix(model_updated[1]) |
|       W2 = as.matrix(model_updated[2]) |
|       W3 = as.matrix(model_updated[3]) |
|       W4 = as.matrix(model_updated[4]) |
|       b1 = as.matrix(model_updated[5]) |
|       b2 = as.matrix(model_updated[6]) |
|       b3 = as.matrix(model_updated[7]) |
|       b4 = as.matrix(model_updated[8]) |
|       vW1 = as.matrix(model_updated[9]) |
|       vW2 = as.matrix(model_updated[10]) |
|       vW3 = as.matrix(model_updated[11]) |
|       vW4 = as.matrix(model_updated[12]) |
|       vb1 = as.matrix(model_updated[13]) |
|       vb2 = as.matrix(model_updated[14]) |
|       vb3 = as.matrix(model_updated[15]) |
|       vb4 = as.matrix(model_updated[16]) |
|       if((i %% print_interval) == 0) { |
|         print("█") |
|       } |
|     } |
|     print("|") |
|   } |
| |
|   model_trained = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4) |
| } |
| |
| /* |
|  * Trains a convolutional net using the "LeNet" architecture by handing the |
|  * initial model, the data and the `gradients` / `aggregation` functions of |
|  * this file to the `paramserv` builtin. |
|  * |
|  * The input matrix, X, has N examples, each represented as a 3D |
|  * volume unrolled into a single vector. The targets, y, have K |
|  * classes, and are one-hot encoded. |
|  * |
|  * Inputs: |
|  *  - X: Input data matrix, of shape (N, C*Hin*Win) |
|  *  - y: Target matrix, of shape (N, K) |
|  *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win) |
|  *  - y_val: Target validation matrix, of shape (N, K) |
|  *  - C: Number of input channels (dimensionality of input depth) |
|  *  - Hin: Input height |
|  *  - Win: Input width |
|  *  - epochs: Total number of full training loops over the full data set |
|  *  - workers: Number of workers to create (passed to paramserv as `k`) |
|  *  - utype: Passed through as paramserv `utype` (update type) |
|  *  - freq: Passed through as paramserv `freq` (update frequency) |
|  *  - batch_size: Batch size (passed to paramserv as `batchsize`) |
|  *  - scheme: Passed through as paramserv `scheme` |
|  *  - mode: Passed through as paramserv `mode` (local or distributed) |
|  *  - learning_rate: The learning rate for the SGD |
|  * |
|  * Outputs: |
|  *  - model_trained: List returned by paramserv. The initial model handed in |
|  *    holds 16 matrices in this order: |
|  *     1-4   W1, W2, W3, W4: layer weights, of shapes (F1, C*Hf*Wf), (F2, F1*Hf*Wf), |
|  *           (F2*(Hin/4)*(Win/4), N3) and (N3, K) |
|  *     5-8   b1, b2, b3, b4: layer biases, of shapes (F1, 1), (F2, 1), (1, N3) and (1, K) |
|  *     9-12  vW1, vW2, vW3, vW4: Nesterov momentum state of the weights |
|  *     13-16 vb1, vb2, vb3, vb4: Nesterov momentum state of the biases |
|  */ |
| train_paramserv = function(matrix[double] X, matrix[double] y, |
|                            matrix[double] X_val, matrix[double] y_val, |
|                            int C, int Hin, int Win, int epochs, int workers, |
|                            string utype, string freq, int batch_size, string scheme, string mode, double learning_rate) |
|     return (list[unknown] model_trained) { |
| |
|   N = nrow(X) |
|   K = ncol(y) |
| |
|   # Network layout: |
|   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax |
|   Hf = 5; Wf = 5 # filter height and width |
|   stride = 1 |
|   pad = 2 # keeps spatial dimensions: (Hf - stride) / 2 |
|   F1 = 32 # number of filters in conv1 |
|   F2 = 64 # number of filters in conv2 |
|   N3 = 512 # number of nodes in affine3; affine4 has K nodes (one per class) |
| |
|   # Optimizer and regularization constants |
|   mu = 0.9 # momentum |
|   decay = 0.95 # learning rate decay constant |
|   lambda = 5e-04 # L2 regularization strength |
| |
|   # Layer parameters, each followed by its Nesterov momentum state |
|   [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win) |
|   vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) |
|   [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2)) |
|   vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) |
|   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2)) |
|   vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) |
|   [W4, b4] = affine::init(N3, K) # inputs: (N, N3) |
|   W4 = W4 / sqrt(2) # different initialization, since it feeds softmax instead of relu |
|   vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4) |
| |
|   # Hyper parameters, looked up by name inside gradients/aggregation |
|   hyper_list = list(learning_rate=learning_rate, mu=mu, decay=decay, |
|                     C=C, Hin=Hin, Win=Win, Hf=Hf, Wf=Wf, stride=stride, pad=pad, |
|                     lambda=lambda, F1=F1, F2=F2, N3=N3) |
| |
|   # Initial model, accessed by position inside gradients/aggregation |
|   initial_model = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4) |
| |
|   # Delegate the actual training to the parameter server |
|   model_trained = paramserv(model=initial_model, features=X, labels=y, |
|                             val_features=X_val, val_labels=y_val, |
|                             upd="./src/test/scripts/functions/federated/paramserv/CNN.dml::gradients", |
|                             agg="./src/test/scripts/functions/federated/paramserv/CNN.dml::aggregation", |
|                             mode=mode, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size, |
|                             k=workers, scheme=scheme, hyperparams=hyper_list, checkpointing="NONE") |
| } |
| |
| /* |
|  * Computes the class probability predictions of a convolutional |
|  * net using the "LeNet" architecture. |
|  * |
|  * The input matrix, X, has N examples, each represented as a 3D |
|  * volume unrolled into a single vector. The forward pass is evaluated |
|  * per mini-batch inside a parfor loop; no dropout is applied here. |
|  * |
|  * Inputs: |
|  *  - X: Input data matrix, of shape (N, C*Hin*Win) |
|  *  - C: Number of input channels (dimensionality of input depth) |
|  *  - Hin: Input height |
|  *  - Win: Input width |
|  *  - batch_size: Batch size |
|  *  - model: List whose first eight entries are read, in this order: |
|  *     1-4  W1, W2, W3, W4: layer weights, of shapes (F1, C*Hf*Wf), (F2, F1*Hf*Wf), |
|  *          (F2*(Hin/4)*(Win/4), N3) and (N3, K) |
|  *     5-8  b1, b2, b3, b4: layer biases, of shapes (F1, 1), (F2, 1), (1, N3) and (1, K) |
|  * |
|  * Outputs: |
|  *  - probs: Class probabilities, of shape (N, K) |
|  */ |
| predict = function(matrix[double] X, int C, int Hin, int Win, int batch_size, list[unknown] model) |
|     return (matrix[double] probs) { |
| |
|   # Unpack weights (positions 1-4) together with their biases (positions 5-8) |
|   W1 = as.matrix(model[1]); b1 = as.matrix(model[5]) |
|   W2 = as.matrix(model[2]); b2 = as.matrix(model[6]) |
|   W3 = as.matrix(model[3]); b3 = as.matrix(model[7]) |
|   W4 = as.matrix(model[4]); b4 = as.matrix(model[8]) |
| |
|   # Network: |
|   ## input -> conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax |
|   stride = 1 |
|   pad = 2 # For same dimensions, (Hf - stride) / 2 |
|   Hf = 5 # filter height |
|   Wf = 5 # filter width |
|   F1 = nrow(W1) # num conv filters in conv1 |
|   F2 = nrow(W2) # num conv filters in conv2 |
|   K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes) |
|   N = nrow(X) |
| |
|   # Result buffer; every batch writes its own row range |
|   num_batches = ceil(N / batch_size) |
|   probs = matrix(0, rows=N, cols=K) |
|   parfor(i in 1:num_batches, check=0) { |
|     # Row range of the i-th batch (the last one may be shorter) |
|     row_lo = ((i-1) * batch_size) %% N + 1 |
|     row_hi = min(N, row_lo + batch_size - 1) |
|     X_part = X[row_lo:row_hi,] |
| |
|     # Forward pass |
|     ## layer 1: conv1 -> relu1 -> pool1 |
|     [c1, Hc1, Wc1] = conv2d::forward(X_part, W1, b1, C, Hin, Win, Hf, Wf, |
|                                      stride, stride, pad, pad) |
|     r1 = relu::forward(c1) |
|     [p1, Hp1, Wp1] = max_pool2d::forward(r1, F1, Hc1, Wc1, 2, 2, 2, 2, 0, 0) |
|     ## layer 2: conv2 -> relu2 -> pool2 |
|     [c2, Hc2, Wc2] = conv2d::forward(p1, W2, b2, F1, Hp1, Wp1, Hf, Wf, |
|                                      stride, stride, pad, pad) |
|     r2 = relu::forward(c2) |
|     [p2, Hp2, Wp2] = max_pool2d::forward(r2, F2, Hc2, Wc2, 2, 2, 2, 2, 0, 0) |
|     ## layer 3: affine3 -> relu3 |
|     a3 = affine::forward(p2, W3, b3) |
|     r3 = relu::forward(a3) |
|     ## layer 4: affine4 -> softmax |
|     a4 = affine::forward(r3, W4, b4) |
|     probs_part = softmax::forward(a4) |
| |
|     # Store the predictions of this batch |
|     probs[row_lo:row_hi,] = probs_part |
|   } |
| } |
| |
| /* |
|  * Evaluates predictions of a convolutional net using the "LeNet" architecture. |
|  * |
|  * The probs matrix contains the class probability predictions |
|  * of K classes over N examples. The targets, y, have K classes, |
|  * and are one-hot encoded. |
|  * |
|  * Inputs: |
|  *  - probs: Class probabilities, of shape (N, K) |
|  *  - y: Target matrix, of shape (N, K) |
|  * |
|  * Outputs: |
|  *  - loss: Scalar cross-entropy loss, as returned by cross_entropy_loss::forward |
|  *  - accuracy: Scalar accuracy: share of rows whose highest-probability column |
|  *    matches the highest-valued column of y |
|  */ |
| eval = function(matrix[double] probs, matrix[double] y) |
|     return (double loss, double accuracy) { |
| |
|   # A row is correct when predicted and target class indices agree |
|   accuracy = mean(rowIndexMax(probs) == rowIndexMax(y)) |
|   loss = cross_entropy_loss::forward(probs, y) |
| } |
| |
| # Gradient (update) function for the LeNet model. It is called directly by |
| # `train` and referenced by name as the `upd` function of `paramserv`. |
| # |
| # Should always use 'features' (batch features), 'labels' (batch labels), |
| # 'hyperparams', 'model' as the arguments |
| # and return the gradients of type list |
| # |
| # Inputs: |
| #  - model: List; entries 1-4 are read as W1..W4 and entries 5-8 as b1..b4 |
| #  - hyperparams: Named list; reads C, Hin, Win, Hf, Wf, stride, pad, lambda, F1, F2 |
| #  - features: Batch features, of shape (batch rows, C*Hin*Win) |
| #  - labels: Batch labels, one-hot encoded |
| # |
| # Outputs: |
| #  - gradients: List (dW1, dW2, dW3, dW4, db1, db2, db3, db4); the weight |
| #    gradients already include the L2 regularization term |
| # |
| # Side effect: prints the training loss and accuracy of the batch. |
| gradients = function(list[unknown] model, |
|                      list[unknown] hyperparams, |
|                      matrix[double] features, |
|                      matrix[double] labels) |
|     return (list[unknown] gradients) { |
| |
|   C = as.integer(as.scalar(hyperparams["C"])) |
|   Hin = as.integer(as.scalar(hyperparams["Hin"])) |
|   Win = as.integer(as.scalar(hyperparams["Win"])) |
|   Hf = as.integer(as.scalar(hyperparams["Hf"])) |
|   Wf = as.integer(as.scalar(hyperparams["Wf"])) |
|   stride = as.integer(as.scalar(hyperparams["stride"])) |
|   pad = as.integer(as.scalar(hyperparams["pad"])) |
|   lambda = as.double(as.scalar(hyperparams["lambda"])) |
|   F1 = as.integer(as.scalar(hyperparams["F1"])) |
|   F2 = as.integer(as.scalar(hyperparams["F2"])) |
|   W1 = as.matrix(model[1]) |
|   W2 = as.matrix(model[2]) |
|   W3 = as.matrix(model[3]) |
|   W4 = as.matrix(model[4]) |
|   b1 = as.matrix(model[5]) |
|   b2 = as.matrix(model[6]) |
|   b3 = as.matrix(model[7]) |
|   b4 = as.matrix(model[8]) |
| |
|   # Dropout probability argument, shared by the forward and backward call so they cannot diverge |
|   dropout_p = 0.5 |
| |
|   # Compute forward pass |
|   ## layer 1: conv1 -> relu1 -> pool1 |
|   [outc1, Houtc1, Woutc1] = conv2d::forward(features, W1, b1, C, Hin, Win, Hf, Wf, |
|                                             stride, stride, pad, pad) |
|   outr1 = relu::forward(outc1) |
|   [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0) |
|   ## layer 2: conv2 -> relu2 -> pool2 |
|   [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, |
|                                             stride, stride, pad, pad) |
|   outr2 = relu::forward(outc2) |
|   [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0) |
|   ## layer 3: affine3 -> relu3 -> dropout |
|   outa3 = affine::forward(outp2, W3, b3) |
|   outr3 = relu::forward(outa3) |
|   [outd3, maskd3] = dropout::forward(outr3, dropout_p, -1) |
|   ## layer 4: affine4 -> softmax |
|   outa4 = affine::forward(outd3, W4, b4) |
|   probs = softmax::forward(outa4) |
| |
|   # Compute loss & accuracy for training data |
|   loss = cross_entropy_loss::forward(probs, labels) |
|   accuracy = mean(rowIndexMax(probs) == rowIndexMax(labels)) |
|   print("[+] Completed forward pass on batch: train loss: " + loss + ", train accuracy: " + accuracy) |
| |
|   # Compute data backward pass |
|   ## loss |
|   dprobs = cross_entropy_loss::backward(probs, labels) |
|   ## layer 4: affine4 -> softmax |
|   douta4 = softmax::backward(dprobs, outa4) |
|   # affine4 consumed the dropout output in the forward pass, so its backward |
|   # pass must receive outd3 (not the pre-dropout outr3) to get the right dW4 |
|   [doutd3, dW4, db4] = affine::backward(douta4, outd3, W4, b4) |
|   ## layer 3: affine3 -> relu3 -> dropout |
|   doutr3 = dropout::backward(doutd3, outr3, dropout_p, maskd3) |
|   douta3 = relu::backward(doutr3, outa3) |
|   [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3) |
|   ## layer 2: conv2 -> relu2 -> pool2 |
|   doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, 2, 2, 2, 2, 0, 0) |
|   doutc2 = relu::backward(doutr2, outc2) |
|   [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1, |
|                                         Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) |
|   ## layer 1: conv1 -> relu1 -> pool1 |
|   doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, 2, 2, 2, 2, 0, 0) |
|   doutc1 = relu::backward(doutr1, outc1) |
|   [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, features, W1, b1, C, Hin, Win, |
|                                           Hf, Wf, stride, stride, pad, pad) |
| |
|   # Compute regularization backward pass and add it to the weight gradients |
|   dW1 = dW1 + l2_reg::backward(W1, lambda) |
|   dW2 = dW2 + l2_reg::backward(W2, lambda) |
|   dW3 = dW3 + l2_reg::backward(W3, lambda) |
|   dW4 = dW4 + l2_reg::backward(W4, lambda) |
| |
|   gradients = list(dW1, dW2, dW3, dW4, db1, db2, db3, db4) |
| } |
| |
| # Aggregation function for the LeNet model: applies one SGD step with |
| # Nesterov momentum to every weight and bias. It is called directly by |
| # `train` and referenced by name as the `agg` function of `paramserv`. |
| # |
| # Should use the arguments named 'model', 'gradients', 'hyperparams' |
| # and return always a model of type list |
| # |
| # Inputs: |
| #  - model: List (W1..W4, b1..b4, vW1..vW4, vb1..vb4), 16 entries |
| #  - hyperparams: Named list; reads learning_rate and mu |
| #  - gradients: List (dW1..dW4, db1..db4), 8 entries |
| # |
| # Outputs: |
| #  - model_result: Updated list in the same 16-entry layout as `model` |
| aggregation = function(list[unknown] model, |
|                        list[unknown] hyperparams, |
|                        list[unknown] gradients) |
|     return (list[unknown] model_result) { |
| |
|   # Optimizer settings |
|   mu = as.double(as.scalar(hyperparams["mu"])) |
|   learning_rate = as.double(as.scalar(hyperparams["learning_rate"])) |
| |
|   # Per layer: parameter, its gradient and its momentum state |
|   ## layer 1 |
|   W1 = as.matrix(model[1]); dW1 = as.matrix(gradients[1]); vW1 = as.matrix(model[9]) |
|   b1 = as.matrix(model[5]); db1 = as.matrix(gradients[5]); vb1 = as.matrix(model[13]) |
|   ## layer 2 |
|   W2 = as.matrix(model[2]); dW2 = as.matrix(gradients[2]); vW2 = as.matrix(model[10]) |
|   b2 = as.matrix(model[6]); db2 = as.matrix(gradients[6]); vb2 = as.matrix(model[14]) |
|   ## layer 3 |
|   W3 = as.matrix(model[3]); dW3 = as.matrix(gradients[3]); vW3 = as.matrix(model[11]) |
|   b3 = as.matrix(model[7]); db3 = as.matrix(gradients[7]); vb3 = as.matrix(model[15]) |
|   ## layer 4 |
|   W4 = as.matrix(model[4]); dW4 = as.matrix(gradients[4]); vW4 = as.matrix(model[12]) |
|   b4 = as.matrix(model[8]); db4 = as.matrix(gradients[8]); vb4 = as.matrix(model[16]) |
| |
|   # Optimize with SGD w/ Nesterov momentum |
|   [W1, vW1] = sgd_nesterov::update(W1, dW1, learning_rate, mu, vW1) |
|   [b1, vb1] = sgd_nesterov::update(b1, db1, learning_rate, mu, vb1) |
|   [W2, vW2] = sgd_nesterov::update(W2, dW2, learning_rate, mu, vW2) |
|   [b2, vb2] = sgd_nesterov::update(b2, db2, learning_rate, mu, vb2) |
|   [W3, vW3] = sgd_nesterov::update(W3, dW3, learning_rate, mu, vW3) |
|   [b3, vb3] = sgd_nesterov::update(b3, db3, learning_rate, mu, vb3) |
|   [W4, vW4] = sgd_nesterov::update(W4, dW4, learning_rate, mu, vW4) |
|   [b4, vb4] = sgd_nesterov::update(b4, db4, learning_rate, mu, vb4) |
| |
|   # Repack in the positional layout that callers index into |
|   model_result = list(W1, W2, W3, W4, b1, b2, b3, b4, vW1, vW2, vW3, vW4, vb1, vb2, vb3, vb4) |
| } |