[SYSTEMDS-2637] Leaky relu NN layer, incl grad check and 2NN example
Closes #1026.
diff --git a/scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml b/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
similarity index 71%
rename from scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml
rename to scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
index 7b225dc..86a3308 100644
--- a/scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml
+++ b/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
@@ -20,17 +20,32 @@
#-------------------------------------------------------------
/*
+ * This Example trains a feed forward neural network with one input layer, two hidden affine layers (200 neurons) with
+ * leaky relu activations and one affine output layer with a softmax activation
+ *
+ * The reason for this example is to test the performance differences between single threaded, with a parameter server
+ * for parallelization and finally federated across multiple SystemDS instances
+ *
+ * Inputs:
+ * - train: The file containing the training data
+ * - test: the file containing the test data
+ *
* The MNIST Data can be downloaded as follows:
* mkdir -p data/mnist/
* cd data/mnist/
* curl -O https://pjreddie.com/media/files/mnist_train.csv
* curl -O https://pjreddie.com/media/files/mnist_test.csv
+ *
+ * Sample Invocation
+ *
+ * systemds "<path to systemds repo>/systemds/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml"
+ * -nvargs train=<path to data>/mnist_data/mnist_train.csv test=<path to data>/mnist_data/mnist_test.csv
*/
source("nn/examples/mnist_2NN.dml") as mnist_2NN
# Read training data
-data = read("mnist_data/mnist_train.csv", format="csv")
+data = read($train, format="csv")
n = nrow(data)
# Extract images and labels
@@ -48,11 +63,11 @@
y_val = labels[1:5000,]
# Train
-epochs = 5
+epochs = 1
[W_1, b_1, W_2, b_2, W_3, b_3] = mnist_2NN::train(X, y, X_val, y_val, epochs)
# Read test data
-data = read("mnist_data/mnist_test.csv", format="csv")
+data = read($test, format="csv")
n = nrow(data)
# Extract images and labels
@@ -67,4 +82,4 @@
probs = mnist_2NN::predict(X_test, W_1, b_1, W_2, b_2, W_3, b_3)
[loss, accuracy] = mnist_2NN::eval(probs, y_test)
-print("Test Accuracy: " + accuracy)
\ No newline at end of file
+print("Test Accuracy: " + accuracy)
diff --git a/scripts/nn/examples/mnist_2NN.dml b/scripts/nn/examples/mnist_2NN.dml
index 3287867..1307e91 100644
--- a/scripts/nn/examples/mnist_2NN.dml
+++ b/scripts/nn/examples/mnist_2NN.dml
@@ -20,24 +20,23 @@
#-------------------------------------------------------------
/*
- * MNIST 2NN Relu Example
+ * MNIST 2NN Leaky Relu Example
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/relu.dml") as relu
+source("nn/layers/leaky_relu.dml") as leaky_relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-train = function(matrix[double] X, matrix[double] Y,
- matrix[double] X_val, matrix[double] Y_val,
- int epochs)
- return (matrix[double] W_1, matrix[double] b_1,
- matrix[double] W_2, matrix[double] b_2,
- matrix[double] W_3, matrix[double] b_3) {
+train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val,
+ matrix[double] Y_val, int epochs)
+ return (matrix[double] W_1, matrix[double] b_1, matrix[double] W_2,
+ matrix[double] b_2, matrix[double] W_3, matrix[double] b_3)
+{
/*
- * Trains a 2NN relu softmax classifier.
+ * Trains a 2 hidden layer leaky relu softmax classifier.
*
* The input matrix, X, has N examples, each with D features.
* The targets, Y, have K classes, and are one-hot encoded.
@@ -53,12 +52,13 @@
* - W: Weights (parameters) matrix, of shape (D, M, 3).
* - b: Biases vector, of shape (1, M, 3).
*/
+
N = nrow(X) # num examples
D = ncol(X) # num features
K = ncol(Y) # num classes
# Create the network:
- # input -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
+ # input -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
[W_1, b_1] = affine::init(D, 200)
[W_2, b_2] = affine::init(200, 200)
[W_3, b_3] = affine::init(200, K)
@@ -87,12 +87,12 @@
y_batch = Y[beg:end,]
# Compute forward pass
- ## input D -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
+ ## input D -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
out_1 = affine::forward(X_batch, W_1, b_1)
- out_1_relu = relu::forward(out_1)
- out_2 = affine::forward(out_1_relu, W_2, b_2)
- out_2_relu = relu::forward(out_2)
- out_3 = affine::forward(out_2_relu, W_3, b_3)
+ out_1_leaky_relu = leaky_relu::forward(out_1)
+ out_2 = affine::forward(out_1_leaky_relu, W_2, b_2)
+ out_2_leaky_relu = leaky_relu::forward(out_2)
+ out_3 = affine::forward(out_2_leaky_relu, W_3, b_3)
probs = softmax::forward(out_3)
# Compute loss & accuracy for training & validation data
@@ -102,16 +102,16 @@
loss_val = cross_entropy_loss::forward(probs_val, Y_val)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
- accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
# Compute backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
dout_3 = softmax::backward(dprobs, out_3)
- [dout_2_relu, dW_3, db_3] = affine::backward(dout_3, out_2_relu, W_3, b_3)
- dout_2 = relu::backward(dout_2_relu, out_2)
- [dout_1_relu, dW_2, db_2] = affine::backward(dout_2, out_1_relu, W_2, b_2)
- dout_1 = relu::backward(dout_1_relu, out_1)
+ [dout_2_leaky_relu, dW_3, db_3] = affine::backward(dout_3, out_2_leaky_relu, W_3, b_3)
+ dout_2 = leaky_relu::backward(dout_2_leaky_relu, out_2)
+ [dout_1_leaky_relu, dW_2, db_2] = affine::backward(dout_2, out_1_leaky_relu, W_2, b_2)
+ dout_1 = leaky_relu::backward(dout_1_leaky_relu, out_1)
[dX_batch, dW_1, db_1] = affine::backward(dout_1, X_batch, W_1, b_1)
# Optimize with SGD
@@ -122,6 +122,7 @@
[W_1, vW_1] = sgd_nesterov::update(W_1, dW_1, lr, mu, vW_1)
[b_1, vb_1] = sgd_nesterov::update(b_1, db_1, lr, mu, vb_1)
}
+
# Anneal momentum towards 0.999
mu = mu + (0.999 - mu)/(1+epochs-e)
# Decay learning rate
@@ -130,9 +131,9 @@
}
predict = function(matrix[double] X,
- matrix[double] W_1, matrix[double] b_1,
- matrix[double] W_2, matrix[double] b_2,
- matrix[double] W_3, matrix[double] b_3)
+ matrix[double] W_1, matrix[double] b_1,
+ matrix[double] W_2, matrix[double] b_2,
+ matrix[double] W_3, matrix[double] b_3)
return (matrix[double] probs) {
/*
* Computes the class probability predictions of a softmax classifier.
@@ -148,10 +149,10 @@
* - probs: Class probabilities, of shape (N, K).
*/
# Compute forward pass
- ## input -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
- out_1_relu = relu::forward(affine::forward(X, W_1, b_1))
- out_2_relu = relu::forward(affine::forward(out_1_relu, W_2, b_2))
- probs = softmax::forward(affine::forward(out_2_relu, W_3, b_3))
+ ## input -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
+ out_1_leaky_relu = leaky_relu::forward(affine::forward(X, W_1, b_1))
+ out_2_leaky_relu = leaky_relu::forward(affine::forward(out_1_leaky_relu, W_2, b_2))
+ probs = softmax::forward(affine::forward(out_2_leaky_relu, W_3, b_3))
}
eval = function(matrix[double] probs, matrix[double] Y)
diff --git a/scripts/nn/layers/leaky_relu.dml b/scripts/nn/layers/leaky_relu.dml
new file mode 100644
index 0000000..82fe3a1
--- /dev/null
+++ b/scripts/nn/layers/leaky_relu.dml
@@ -0,0 +1,54 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Leaky Rectified Linear Unit (ReLU) nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a Leaky ReLU nonlinearity layer.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ # compute ifelse(X > 0, X, 0.01 * X)
+ out = max(X, 0.01 * X)
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a Leaky ReLU nonlinearity layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = ifelse(X > 0, dout, 0.01 * dout)
+}
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
index 56585c0..c409707 100644
--- a/scripts/nn/test/grad_check.dml
+++ b/scripts/nn/test/grad_check.dml
@@ -46,6 +46,7 @@
source("scripts/nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
source("scripts/nn/layers/upsample2d.dml") as upsample2d
source("scripts/nn/layers/relu.dml") as relu
+source("scripts/nn/layers/leaky_relu.dml") as leaky_relu
source("scripts/nn/layers/rnn.dml") as rnn
source("scripts/nn/layers/scale_shift1d.dml") as scale_shift1d
source("scripts/nn/layers/scale_shift2d.dml") as scale_shift2d
@@ -1827,6 +1828,50 @@
}
}
+leaky_relu = function() {
+ /*
+ * Gradient check for the ReLU nonlinearity layer.
+ *
+ * NOTE: This could result in a false-negative in which the test
+ * fails due to a kink being crossed in the nonlinearity. This
+ * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+ * sides of the zero threshold of max(0, fx). For now, just run
+ * the tests again. In the future, we can explicitly check for
+ * this and rerun the test automatically.
+ */
+ print("Grad checking the Leaky ReLU nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M, min=-5, max=5)
+ y = rand(rows=N, cols=M)
+ # Compute analytical gradients of loss wrt parameters
+ out = leaky_relu::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = leaky_relu::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = leaky_relu::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = leaky_relu::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
rnn = function() {
/*
* Gradient check for the simple RNN layer.
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
index 36f1583..4c37bf4 100644
--- a/scripts/nn/test/run_tests.dml
+++ b/scripts/nn/test/run_tests.dml
@@ -60,6 +60,7 @@
grad_check::avg_pool2d_builtin()
grad_check::upsample2d()
grad_check::relu()
+grad_check::leaky_relu()
grad_check::rnn()
grad_check::scale_shift1d()
grad_check::scale_shift2d()