<!--
{% comment %}
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to you under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{% endcomment %}
-->

# SystemDS-NN

This folder contains different primitives (layers, losses, and optimizers) for neural network training and prediction.

## Neural net for regression with vanilla SGD

The following example trains a simple two-layer network (affine -> ReLU -> affine) on random data with an L2 loss, using plain mini-batch SGD and a decaying learning rate.

```R
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/relu.dml") as relu
source("nn/optim/sgd.dml") as sgd

# Generate input data
N = 1024 # num examples
D = 100 # num features
t = 1 # num targets
X = rand(rows=N, cols=D, pdf="normal")
y = rand(rows=N, cols=t)

# Create 2-layer network:
## affine1 -> relu1 -> affine2
M = 64 # number of neurons
[W1, b1] = affine::init(D, M, -1)
[W2, b2] = affine::init(M, t, -1)

# Initialize optimizer
lr = 0.05 # learning rate
decay = 0.99 # learning rate decay constant

# Optimize
print("Starting optimization")
batch_size = 32
epochs = 5
iters = ceil(N / batch_size)
for (e in 1:epochs) {
  for (i in 1:iters) {
    # Get next batch
    beg = ((i-1) * batch_size) %% N + 1
    end = min(N, beg + batch_size - 1)
    X_batch = X[beg:end,]
    y_batch = y[beg:end,]

    # Compute forward pass
    out1 = affine::forward(X_batch, W1, b1)
    outr1 = relu::forward(out1)
    out2 = affine::forward(outr1, W2, b2)

    # Compute loss
    loss = l2_loss::forward(out2, y_batch)
    print("L2 loss: " + loss)

    # Compute backward pass
    dout2 = l2_loss::backward(out2, y_batch)
    [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
    dout1 = relu::backward(doutr1, out1)
    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)

    # Optimize with vanilla SGD
    W1 = sgd::update(W1, dW1, lr)
    b1 = sgd::update(b1, db1, lr)
    W2 = sgd::update(W2, dW2, lr)
    b2 = sgd::update(b2, db2, lr)
  }
  # Decay learning rate
  lr = lr * decay
}
```
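
After training, prediction is just a forward pass through the trained layers; no loss gradient or parameter update is involved. A minimal sketch, assuming held-out matrices `X_test` and `y_test` (illustrative names, not defined above):

```R
# Forward pass on held-out data (X_test / y_test are assumed to exist)
out1 = affine::forward(X_test, W1, b1)
outr1 = relu::forward(out1)
y_pred = affine::forward(outr1, W2, b2)

# Evaluate with the same L2 loss used during training
test_loss = l2_loss::forward(y_pred, y_test)
print("Test L2 loss: " + test_loss)
```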

## Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum

The following example trains a three-layer classifier with dropout regularization and a softmax output trained with a cross-entropy loss. It uses SGD with Nesterov momentum, annealing the momentum towards 0.999 and decaying the learning rate after each epoch.

```R
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

# Generate input data
N = 1024 # num examples
D = 100 # num features
t = 5 # num targets
X = rand(rows=N, cols=D, pdf="normal")
classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
y = matrix(0, rows=N, cols=t)
parfor (i in 1:N) {
  y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding
}

# Create network:
# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
H1 = 64 # number of neurons in 1st hidden layer
H2 = 64 # number of neurons in 2nd hidden layer
p = 0.5 # dropout probability
[W1, b1] = affine::init(D, H1, -1)
[W2, b2] = affine::init(H1, H2, -1)
[W3, b3] = affine::init(H2, t, -1)

# Initialize SGD w/ Nesterov momentum optimizer
lr = 0.05 # learning rate
mu = 0.5 # momentum
decay = 0.99 # learning rate decay constant
vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)

# Optimize
print("Starting optimization")
batch_size = 64
epochs = 10
iters = ceil(N / batch_size)
for (e in 1:epochs) {
  for (i in 1:iters) {
    # Get next batch
    beg = ((i-1) * batch_size) %% N + 1
    end = min(N, beg + batch_size - 1)
    X_batch = X[beg:end,]
    y_batch = y[beg:end,]

    # Compute forward pass
    ## layer 1:
    out1 = affine::forward(X_batch, W1, b1)
    outr1 = relu::forward(out1)
    [outd1, maskd1] = dropout::forward(outr1, p, -1)
    ## layer 2:
    out2 = affine::forward(outd1, W2, b2)
    outr2 = relu::forward(out2)
    [outd2, maskd2] = dropout::forward(outr2, p, -1)
    ## layer 3:
    out3 = affine::forward(outd2, W3, b3)
    probs = softmax::forward(out3)

    # Compute loss
    loss = cross_entropy_loss::forward(probs, y_batch)
    print("Cross entropy loss: " + loss)

    # Compute backward pass
    ## loss:
    dprobs = cross_entropy_loss::backward(probs, y_batch)
    ## layer 3:
    dout3 = softmax::backward(dprobs, out3)
    [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
    ## layer 2:
    doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
    dout2 = relu::backward(doutr2, out2)
    [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
    ## layer 1:
    doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
    dout1 = relu::backward(doutr1, out1)
    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)

    # Optimize with SGD w/ Nesterov momentum
    [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
    [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
    [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
    [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
    [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
    [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
  }
  # Anneal momentum towards 0.999
  mu = mu + (0.999 - mu)/(1+epochs-e)
  # Decay learning rate
  lr = lr * decay
}
```
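
At prediction time the dropout layers are simply skipped, since dropout is a training-time regularizer, and the predicted class is the one with the highest softmax probability. A minimal sketch, assuming held-out matrices `X_test` and one-hot `y_test` (illustrative names, not defined above):

```R
# Forward pass on held-out data (X_test / y_test are assumed to exist);
# dropout is skipped at prediction time
out1 = affine::forward(X_test, W1, b1)
outr1 = relu::forward(out1)
out2 = affine::forward(outr1, W2, b2)
outr2 = relu::forward(out2)
out3 = affine::forward(outr2, W3, b3)
probs = softmax::forward(out3)

# Accuracy: compare predicted class indices with the one-hot label indices
pred_classes = rowIndexMax(probs)
true_classes = rowIndexMax(y_test)
accuracy = mean(pred_classes == true_classes)
print("Test accuracy: " + accuracy)
```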