<!--
{% comment %}
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to you under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
{% endcomment %}
-->
# SystemDS-NN
This folder contains the primitives for building neural networks: layers, loss functions, and optimizers, used for both training and prediction.
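Each primitive is a DML module with a small, consistent interface: layers provide `forward` and `backward` functions (plus `init` for layers with parameters), and optimizers provide `update` (plus `init` for stateful optimizers such as `sgd_nesterov`), as the examples below show. A minimal sketch of that interface using the `affine` layer; the toy matrix sizes and the stand-in upstream gradient are made up for illustration:

```R
# Imports
source("nn/layers/affine.dml") as affine

# Toy input: 8 examples with 4 features (illustrative sizes)
X = rand(rows=8, cols=4, pdf="normal")

# Initialize a 4 -> 2 affine (fully connected) layer;
# -1 is the seed argument used throughout the examples below
[W, b] = affine::init(4, 2, -1)

# Forward pass
out = affine::forward(X, W, b)  # 8 x 2 outputs

# Backward pass: given an upstream gradient of the loss w.r.t. `out`,
# return gradients w.r.t. the input and the parameters
dout = rand(rows=8, cols=2)  # stand-in upstream gradient
[dX, dW, db] = affine::backward(dout, X, W, b)
```

The two end-to-end examples below compose these primitives into full training loops.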
## Neural net for regression with vanilla SGD
```R
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/relu.dml") as relu
source("nn/optim/sgd.dml") as sgd

# Generate input data
N = 1024 # num examples
D = 100 # num features
t = 1 # num targets
X = rand(rows=N, cols=D, pdf="normal")
y = rand(rows=N, cols=t)

# Create 2-layer network:
## affine1 -> relu1 -> affine2
M = 64 # number of neurons
[W1, b1] = affine::init(D, M, -1)
[W2, b2] = affine::init(M, t, -1)

# Initialize optimizer
lr = 0.05 # learning rate
decay = 0.99 # learning rate decay constant

# Optimize
print("Starting optimization")
batch_size = 32
epochs = 5
iters = N / batch_size
for (e in 1:epochs) {
  for (i in 1:iters) {
    # Get next batch
    beg = (i-1) * batch_size + 1
    end = min(N, beg + batch_size - 1)
    X_batch = X[beg:end,]
    y_batch = y[beg:end,]

    # Compute forward pass
    out1 = affine::forward(X_batch, W1, b1)
    outr1 = relu::forward(out1)
    out2 = affine::forward(outr1, W2, b2)

    # Compute loss
    loss = l2_loss::forward(out2, y_batch)
    print("L2 loss: " + loss)

    # Compute backward pass
    dout2 = l2_loss::backward(out2, y_batch)
    [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
    dout1 = relu::backward(doutr1, out1)
    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)

    # Optimize with vanilla SGD
    W1 = sgd::update(W1, dW1, lr)
    b1 = sgd::update(b1, db1, lr)
    W2 = sgd::update(W2, dW2, lr)
    b2 = sgd::update(b2, db2, lr)
  }
  # Decay learning rate
  lr = lr * decay
}
```
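Once trained, the same forward functions produce predictions for new data. A minimal sketch, where `X_new` stands in for unseen examples:

```R
# Predict with the trained 2-layer regression network
X_new = rand(rows=16, cols=D, pdf="normal")  # stand-in for unseen data
out1 = affine::forward(X_new, W1, b1)
outr1 = relu::forward(out1)
y_pred = affine::forward(outr1, W2, b2)  # one predicted target per row
```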
## Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum
```R
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

# Generate input data
N = 1024 # num examples
D = 100 # num features
t = 5 # num targets
X = rand(rows=N, cols=D, pdf="normal")
classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
y = matrix(0, rows=N, cols=t)
parfor (i in 1:N) {
  y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding
}

# Create network:
## affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
H1 = 64 # number of neurons in 1st hidden layer
H2 = 64 # number of neurons in 2nd hidden layer
p = 0.5 # dropout probability
[W1, b1] = affine::init(D, H1, -1)
[W2, b2] = affine::init(H1, H2, -1)
[W3, b3] = affine::init(H2, t, -1)

# Initialize SGD w/ Nesterov momentum optimizer
lr = 0.05 # learning rate
mu = 0.5 # momentum
decay = 0.99 # learning rate decay constant
vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)

# Optimize
print("Starting optimization")
batch_size = 64
epochs = 10
iters = N / batch_size
for (e in 1:epochs) {
  for (i in 1:iters) {
    # Get next batch
    beg = (i-1) * batch_size + 1
    end = min(N, beg + batch_size - 1)
    X_batch = X[beg:end,]
    y_batch = y[beg:end,]

    # Compute forward pass
    ## layer 1:
    out1 = affine::forward(X_batch, W1, b1)
    outr1 = relu::forward(out1)
    [outd1, maskd1] = dropout::forward(outr1, p, -1)
    ## layer 2:
    out2 = affine::forward(outd1, W2, b2)
    outr2 = relu::forward(out2)
    [outd2, maskd2] = dropout::forward(outr2, p, -1)
    ## layer 3:
    out3 = affine::forward(outd2, W3, b3)
    probs = softmax::forward(out3)

    # Compute loss
    loss = cross_entropy_loss::forward(probs, y_batch)
    print("Cross entropy loss: " + loss)

    # Compute backward pass
    ## loss:
    dprobs = cross_entropy_loss::backward(probs, y_batch)
    ## layer 3:
    dout3 = softmax::backward(dprobs, out3)
    [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
    ## layer 2:
    doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
    dout2 = relu::backward(doutr2, out2)
    [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
    ## layer 1:
    doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
    dout1 = relu::backward(doutr1, out1)
    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)

    # Optimize with SGD w/ Nesterov momentum
    [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
    [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
    [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
    [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
    [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
    [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
  }
  # Anneal momentum towards 0.999
  mu = mu + (0.999 - mu)/(1+epochs-e)
  # Decay learning rate
  lr = lr * decay
}
```
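For predictions with this network, run the same forward pass but without the dropout layers, since dropout is only applied during training. A minimal sketch, where `X_new` stands in for unseen examples and the DML builtin `rowIndexMax` is used to pick the most probable class:

```R
# Predict classes with the trained network; dropout layers are skipped
X_new = rand(rows=16, cols=D, pdf="normal")  # stand-in for unseen data
out1 = affine::forward(X_new, W1, b1)
outr1 = relu::forward(out1)
out2 = affine::forward(outr1, W2, b2)
outr2 = relu::forward(out2)
out3 = affine::forward(outr2, W3, b3)
probs = softmax::forward(out3)  # class probabilities, one row per example
y_pred = rowIndexMax(probs)     # column index of the most probable class
```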