[SYSTEMDS-2637] Leaky ReLU NN layer, incl. grad check and 2NN example

Closes #1026.
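
For reviewers, a minimal usage sketch of the new layer in DML (toy shapes and the
variable names W, b, A, Z, dZ are illustrative only; the calls mirror those in
mnist_2NN.dml and grad_check.dml below):

    source("nn/layers/affine.dml") as affine
    source("nn/layers/leaky_relu.dml") as leaky_relu

    # Hypothetical toy batch: 4 examples with 8 features, 16 hidden neurons.
    X = rand(rows=4, cols=8)
    [W, b] = affine::init(8, 16)

    # Forward pass: affine -> leaky ReLU, i.e. elementwise max(A, 0.01*A).
    A = affine::forward(X, W, b)
    Z = leaky_relu::forward(A)

    # Backward pass: route an upstream gradient through the layer
    # (dZ where A > 0, 0.01*dZ otherwise).
    dZ = rand(rows=4, cols=16)  # stand-in upstream gradient
    dA = leaky_relu::backward(dZ, A)

The layer is wired into the renamed 2NN example and covered by grad_check::leaky_relu()
in run_tests.dml.
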
diff --git a/scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml b/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
similarity index 71%
rename from scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml
rename to scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
index 7b225dc..86a3308 100644
--- a/scripts/nn/examples/Example-MNIST_2NN_ReLu_Softmax.dml
+++ b/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml
@@ -20,17 +20,32 @@
 #-------------------------------------------------------------
 
 /*
+ * This example trains a feed-forward neural network with one input layer, two hidden affine layers (200 neurons each)
+ * with leaky ReLU activations, and one affine output layer with a softmax activation.
+ *
+ * The purpose of this example is to compare performance between single-threaded execution, parallelization with a
+ * parameter server, and federated execution across multiple SystemDS instances.
+ *
+ * Inputs:
+ *  - train: The file containing the training data
+ *  - test: The file containing the test data
+ *
  * The MNIST Data can be downloaded as follows:
  * mkdir -p data/mnist/
  * cd data/mnist/
  * curl -O https://pjreddie.com/media/files/mnist_train.csv
  * curl -O https://pjreddie.com/media/files/mnist_test.csv
+ *
+ * Sample Invocation
+ *
+ * systemds "<path to systemds repo>/systemds/scripts/nn/examples/Example-MNIST_2NN_Leaky_ReLu_Softmax.dml"
+ *       -nvargs train=<path to data>/mnist_data/mnist_train.csv test=<path to data>/mnist_data/mnist_test.csv
  */
 
 source("nn/examples/mnist_2NN.dml") as mnist_2NN
 
 # Read training data
-data = read("mnist_data/mnist_train.csv", format="csv")
+data = read($train, format="csv")
 n = nrow(data)
 
 # Extract images and labels
@@ -48,11 +63,11 @@
 y_val = labels[1:5000,]
 
 # Train
-epochs = 5
+epochs = 1
 [W_1, b_1, W_2, b_2, W_3, b_3] = mnist_2NN::train(X, y, X_val, y_val, epochs)
 
 # Read test data
-data = read("mnist_data/mnist_test.csv", format="csv")
+data = read($test, format="csv")
 n = nrow(data)
 
 # Extract images and labels
@@ -67,4 +82,4 @@
 probs = mnist_2NN::predict(X_test, W_1, b_1, W_2, b_2, W_3, b_3)
 [loss, accuracy] = mnist_2NN::eval(probs, y_test)
 
-print("Test Accuracy: " + accuracy)
\ No newline at end of file
+print("Test Accuracy: " + accuracy)
diff --git a/scripts/nn/examples/mnist_2NN.dml b/scripts/nn/examples/mnist_2NN.dml
index 3287867..1307e91 100644
--- a/scripts/nn/examples/mnist_2NN.dml
+++ b/scripts/nn/examples/mnist_2NN.dml
@@ -20,24 +20,23 @@
 #-------------------------------------------------------------
 
 /*
- * MNIST 2NN Relu Example
+ * MNIST 2NN Leaky ReLU Example
  */
 
 # Imports
 source("nn/layers/affine.dml") as affine
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/relu.dml") as relu
+source("nn/layers/leaky_relu.dml") as leaky_relu
 source("nn/layers/softmax.dml") as softmax
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
 
-train = function(matrix[double] X, matrix[double] Y,
-                 matrix[double] X_val, matrix[double] Y_val,
-                 int epochs)
-    return (matrix[double] W_1, matrix[double] b_1,
-            matrix[double] W_2, matrix[double] b_2,
-            matrix[double] W_3, matrix[double] b_3) {
+train = function(matrix[double] X, matrix[double] Y, matrix[double] X_val,
+                 matrix[double] Y_val, int epochs)
+  return (matrix[double] W_1, matrix[double] b_1, matrix[double] W_2,
+          matrix[double] b_2, matrix[double] W_3, matrix[double] b_3)
+{
   /*
-   * Trains a 2NN relu softmax classifier.
+   * Trains a 2-hidden-layer leaky ReLU softmax classifier.
    *
    * The input matrix, X, has N examples, each with D features.
    * The targets, Y, have K classes, and are one-hot encoded.
@@ -53,12 +52,13 @@
    *  - W: Weights (parameters) matrix, of shape (D, M, 3).
    *  - b: Biases vector, of shape (1, M, 3).
    */
+
   N = nrow(X)  # num examples
   D = ncol(X)  # num features
   K = ncol(Y)  # num classes
 
   # Create the network:
-  # input -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
+  # input -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
   [W_1, b_1] = affine::init(D, 200)
   [W_2, b_2] = affine::init(200, 200)
   [W_3, b_3] = affine::init(200, K)
@@ -87,12 +87,12 @@
       y_batch = Y[beg:end,]
 
       # Compute forward pass
-      ## input D -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
+      ## input D -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
       out_1 = affine::forward(X_batch, W_1, b_1)
-      out_1_relu = relu::forward(out_1)
-      out_2 = affine::forward(out_1_relu, W_2, b_2)
-      out_2_relu = relu::forward(out_2)
-      out_3 = affine::forward(out_2_relu, W_3, b_3)
+      out_1_leaky_relu = leaky_relu::forward(out_1)
+      out_2 = affine::forward(out_1_leaky_relu, W_2, b_2)
+      out_2_leaky_relu = leaky_relu::forward(out_2)
+      out_3 = affine::forward(out_2_leaky_relu, W_3, b_3)
       probs = softmax::forward(out_3)
 
       # Compute loss & accuracy for training & validation data
@@ -102,16 +102,16 @@
       loss_val = cross_entropy_loss::forward(probs_val, Y_val)
       accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
       print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
-            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+      accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
 
       # Compute backward pass
       ## loss:
       dprobs = cross_entropy_loss::backward(probs, y_batch)
       dout_3 = softmax::backward(dprobs, out_3)
-      [dout_2_relu, dW_3, db_3] = affine::backward(dout_3, out_2_relu, W_3, b_3)
-      dout_2 = relu::backward(dout_2_relu, out_2)
-      [dout_1_relu, dW_2, db_2] = affine::backward(dout_2, out_1_relu, W_2, b_2)
-      dout_1 = relu::backward(dout_1_relu, out_1)
+      [dout_2_leaky_relu, dW_3, db_3] = affine::backward(dout_3, out_2_leaky_relu, W_3, b_3)
+      dout_2 = leaky_relu::backward(dout_2_leaky_relu, out_2)
+      [dout_1_leaky_relu, dW_2, db_2] = affine::backward(dout_2, out_1_leaky_relu, W_2, b_2)
+      dout_1 = leaky_relu::backward(dout_1_leaky_relu, out_1)
       [dX_batch, dW_1, db_1] = affine::backward(dout_1, X_batch, W_1, b_1)
 
       # Optimize with SGD
@@ -122,6 +122,7 @@
       [W_1, vW_1] = sgd_nesterov::update(W_1, dW_1, lr, mu, vW_1)
       [b_1, vb_1] = sgd_nesterov::update(b_1, db_1, lr, mu, vb_1)
     }
+
     # Anneal momentum towards 0.999
     mu = mu + (0.999 - mu)/(1+epochs-e)
     # Decay learning rate
@@ -130,9 +131,9 @@
 }
 
 predict = function(matrix[double] X,
-                    matrix[double] W_1, matrix[double] b_1,
-                    matrix[double] W_2, matrix[double] b_2,
-                    matrix[double] W_3, matrix[double] b_3)
+                   matrix[double] W_1, matrix[double] b_1,
+                   matrix[double] W_2, matrix[double] b_2,
+                   matrix[double] W_3, matrix[double] b_3)
     return (matrix[double] probs) {
   /*
    * Computes the class probability predictions of a softmax classifier.
@@ -148,10 +149,10 @@
    *  - probs: Class probabilities, of shape (N, K).
    */
   # Compute forward pass
-  ## input -> 200 neuron affine -> relu -> 200 neuron affine -> relu -> K neurons affine -> softmax
-  out_1_relu = relu::forward(affine::forward(X, W_1, b_1))
-  out_2_relu = relu::forward(affine::forward(out_1_relu, W_2, b_2))
-  probs = softmax::forward(affine::forward(out_2_relu, W_3, b_3))
+  ## input -> 200 neuron affine -> leaky_relu -> 200 neuron affine -> leaky_relu -> K neurons affine -> softmax
+  out_1_leaky_relu = leaky_relu::forward(affine::forward(X, W_1, b_1))
+  out_2_leaky_relu = leaky_relu::forward(affine::forward(out_1_leaky_relu, W_2, b_2))
+  probs = softmax::forward(affine::forward(out_2_leaky_relu, W_3, b_3))
 }
 
 eval = function(matrix[double] probs, matrix[double] Y)
diff --git a/scripts/nn/layers/leaky_relu.dml b/scripts/nn/layers/leaky_relu.dml
new file mode 100644
index 0000000..82fe3a1
--- /dev/null
+++ b/scripts/nn/layers/leaky_relu.dml
@@ -0,0 +1,54 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Leaky Rectified Linear Unit (Leaky ReLU) nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a Leaky ReLU nonlinearity layer.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   */
+  # equivalent to ifelse(X > 0, X, 0.01 * X)
+  out = max(X, 0.01 * X)
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a Leaky ReLU nonlinearity layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Previous input data matrix, of shape (any, any).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  dX = ifelse(X > 0, dout, 0.01 * dout)
+}
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
index 56585c0..c409707 100644
--- a/scripts/nn/test/grad_check.dml
+++ b/scripts/nn/test/grad_check.dml
@@ -46,6 +46,7 @@
 source("scripts/nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
 source("scripts/nn/layers/upsample2d.dml") as upsample2d
 source("scripts/nn/layers/relu.dml") as relu
+source("scripts/nn/layers/leaky_relu.dml") as leaky_relu
 source("scripts/nn/layers/rnn.dml") as rnn
 source("scripts/nn/layers/scale_shift1d.dml") as scale_shift1d
 source("scripts/nn/layers/scale_shift2d.dml") as scale_shift2d
@@ -1827,6 +1828,50 @@
   }
 }
 
+leaky_relu = function() {
+  /*
+   * Gradient check for the Leaky ReLU nonlinearity layer.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the kink at zero in max(X, 0.01*X).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking the Leaky ReLU nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  M = 10  # num neurons
+  X = rand(rows=N, cols=M, min=-5, max=5)
+  y = rand(rows=N, cols=M)
+  # Compute analytical gradients of loss wrt parameters
+  out = leaky_relu::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = leaky_relu::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = leaky_relu::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = leaky_relu::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
 rnn = function() {
   /*
    * Gradient check for the simple RNN layer.
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
index 36f1583..4c37bf4 100644
--- a/scripts/nn/test/run_tests.dml
+++ b/scripts/nn/test/run_tests.dml
@@ -60,6 +60,7 @@
 grad_check::avg_pool2d_builtin()
 grad_check::upsample2d()
 grad_check::relu()
+grad_check::leaky_relu()
 grad_check::rnn()
 grad_check::scale_shift1d()
 grad_check::scale_shift2d()