| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Trains a 2-layer autoencoder with minibatch SGD, momentum and step-size decay. |
| # If invoked with H1 > H2 then it becomes a 'bowtie' structured autoencoder |
| # Weights are initialized using Glorot & Bengio (2010) AISTATS initialization. |
| # The script standardizes the input before training (can be turned off). |
| # Also, it randomly reshuffles rows before training. |
| # Currently, tanh is set to be the activation function. |
| # By re-implementing 'func' DML-bodied function, one can change the activation. |
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Filename where the input is stored |
| # H1 Int --- Number of neurons in the 1st hidden layer |
| # H2 Int --- Number of neurons in the 2nd hidden layer |
| # EPOCH Int --- Number of epochs to train for |
| # fmt String 'text' Output format ("text", "csv", "binary" etc.) |
| # OBJ Boolean FALSE If TRUE, Computes objective function value (squared-loss) |
| # at the end of each epoch. Note that, computing the full |
| # objective can take a lot of time. |
| # BATCH Int 256 Mini-batch size (training parameter) |
| # STEP Double 1e-5 Initial step size (training parameter) |
| # DECAY Double 0.95 Decays step size after each epoch (training parameter) |
| # MOMENTUM Double 0.9 Momentum parameter (training parameter) |
| # --------------------------------------------------------------------------------------------- |
| # |
| # OUTPUT PARAMETERS (all filenames): |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # W1_out String --- File to store weights between input layer and 1st hidden layer |
| # b1_out String --- File to store bias between input layer and 1st hidden layer |
| # W2_out String --- File to store weights between 1st hidden layer and 2nd hidden layer |
| # b2_out String --- File to store bias between 1st hidden layer and 2nd hidden layer |
| # W3_out String --- File to store weights between 2nd hidden layer and 3rd hidden layer |
| # b3_out String --- File to store bias between 2nd hidden layer and 3rd hidden layer |
| # W4_out String --- File to store weights between 3rd hidden layer and output layer |
| # b4_out String --- File to store bias between 3rd hidden layer and output layer |
| # HIDDEN String " " File to store the hidden (2nd) layer representation if needed |
| # --------------------------------------------------------------------------------------------- |
| # |
| # INVOCATION: |
| # -f autoencoder_2layer.dml --nvargs X=<input file> H1=500 H2=2 EPOCH=1 fmt="csv" |
| # W1_out=<weights from input to 1st hidden layer> b1_out=<bias from input to 1st hidden layer> |
| # W2_out=<weights from 1st hidden layer to 2nd hidden layer> b2_out=<bias from 1st hidden layer to 2nd hidden layer> |
| # W3_out=<weights from 2nd hidden layer to 3rd hidden layer> b3_out=<bias from 2nd hidden layer to 3rd hidden layer> |
| # W4_out=<weights from 3rd hidden layer to output> b4_out=<bias from 3rd hidden layer to output> |
| # |
| |
| #implements tanh |
| #to use another activation fn, implement a DML-bodied function with |
| #function name 'func' and comment out this one |
| func = function(Matrix[Double] X) return(Matrix[Double] Y, Matrix[Double] Y_prime){ |
| Y = (exp(2*X) - 1)/(exp(2*X) + 1) |
| Y_prime = 1 - Y^2 |
| } |
| |
| feedForward = function(Matrix[Double] X, |
| Matrix[Double] W1, Matrix[Double] b1, |
| Matrix[Double] W2, Matrix[Double] b2, |
| Matrix[Double] W3, Matrix[Double] b3, |
| Matrix[Double] W4, Matrix[Double] b4, |
| Matrix[Double] Y) |
| return(Matrix[Double] H1, Matrix[Double] H1_prime, |
| Matrix[Double] H2, Matrix[Double] H2_prime, |
| Matrix[Double] H3, Matrix[Double] H3_prime, |
| Matrix[Double] Yhat, Matrix[Double] Yhat_prime, |
| Matrix[Double] E){ |
| H1_in = t(W1 %*% t(X) + b1) |
| [H1, H1_prime] = func(H1_in) |
| |
| H2_in = t(W2 %*% t(H1) + b2) |
| [H2, H2_prime] = func(H2_in) |
| |
| H3_in = t(W3 %*% t(H2) + b3) |
| [H3, H3_prime] = func(H3_in) |
| |
| Yhat_in = t(W4 %*% t(H3) + b4) |
| [Yhat, Yhat_prime] = func(Yhat_in) |
| E = Yhat - Y |
| } |
| |
| grad = function(Matrix[Double] X, |
| Matrix[Double] H1, Matrix[Double] H1_prime, |
| Matrix[Double] H2, Matrix[Double] H2_prime, |
| Matrix[Double] H3, Matrix[Double] H3_prime, |
| Matrix[Double] Yhat_prime, |
| Matrix[Double] E, |
| Matrix[Double] W1, Matrix[Double] W2, Matrix[Double] W3, Matrix[Double] W4) |
| return(Matrix[Double] W1_grad, Matrix[Double] b1_grad, |
| Matrix[Double] W2_grad, Matrix[Double] b2_grad, |
| Matrix[Double] W3_grad, Matrix[Double] b3_grad, |
| Matrix[Double] W4_grad, Matrix[Double] b4_grad){ |
| #backprop |
| delta4 = E * Yhat_prime |
| delta3 = H3_prime * (delta4 %*% W4) |
| delta2 = H2_prime * (delta3 %*% W3) |
| delta1 = H1_prime * (delta2 %*% W2) |
| |
| #compute gradients |
| b4_grad = t(colSums(delta4)) |
| b3_grad = t(colSums(delta3)) |
| b2_grad = t(colSums(delta2)) |
| b1_grad = t(colSums(delta1)) |
| |
| W4_grad = t(delta4) %*% H3 |
| W3_grad = t(delta3) %*% H2 |
| W2_grad = t(delta2) %*% H1 |
| W1_grad = t(delta1) %*% X |
| } |
| |
| obj = function(Matrix[Double] E) return(Double val){ |
| val = 0.5 * sum(E^2) |
| } |
| |
| batch_size = ifdef($BATCH, 256) |
| mu = ifdef($MOMENTUM, 0.9) |
| step = ifdef($STEP, 1e-5) |
| decay = ifdef($DECAY, 0.95) |
| hfile = ifdef($HIDDEN, " ") |
| fmt = ifdef($fmt, "text") |
| full_obj = ifdef($OBJ, FALSE) |
| |
| X = read($X) |
| num_hidden1 = $H1 |
| num_hidden2 = $H2 |
| max_epochs = $EPOCH |
| |
| n = nrow(X) |
| m = ncol(X) |
| |
| #randomly reordering rows |
| permut = table(seq(1,n,1), order(target=Rand(rows=n, cols=1, min=0, max=1, pdf="uniform"), by=1, index.return=TRUE), n, n) |
| X = permut %*% X |
| |
| #z-transform, whitening operator is better |
| means = colSums(X)/n |
| stds = sqrt((colSums(X^2)/n - means*means)*n/(n-1)) + 1e-17 |
| X = (X - means)/stds |
| |
| W1 = sqrt(6)/sqrt(m + num_hidden1) * Rand(rows=num_hidden1, cols=m, min=-1, max=1, pdf="uniform") |
| b1 = matrix(0, rows=num_hidden1, cols=1) |
| W2 = sqrt(6)/sqrt(num_hidden1 + num_hidden2) * Rand(rows=num_hidden2, cols=num_hidden1, min=-1, max=1, pdf="uniform") |
| b2 = matrix(0, rows=num_hidden2, cols=1) |
| W3 = sqrt(6)/sqrt(num_hidden2 + num_hidden1) * Rand(rows=num_hidden1, cols=num_hidden2, min=-1, max=1, pdf="uniform") |
| b3 = matrix(0, rows=num_hidden1, cols=1) |
| W4 = sqrt(6)/sqrt(num_hidden2 + m) * Rand(rows=m, cols=num_hidden1, min=-1, max=1, pdf="uniform") |
| b4 = matrix(0, rows=m, cols=1) |
| |
| upd_W1 = matrix(0, rows=nrow(W1), cols=ncol(W1)) |
| upd_b1 = matrix(0, rows=nrow(b1), cols=ncol(b1)) |
| upd_W2 = matrix(0, rows=nrow(W2), cols=ncol(W2)) |
| upd_b2 = matrix(0, rows=nrow(b2), cols=ncol(b2)) |
| upd_W3 = matrix(0, rows=nrow(W3), cols=ncol(W3)) |
| upd_b3 = matrix(0, rows=nrow(b3), cols=ncol(b3)) |
| upd_W4 = matrix(0, rows=nrow(W4), cols=ncol(W4)) |
| upd_b4 = matrix(0, rows=nrow(b4), cols=ncol(b4)) |
| |
| if( full_obj ){ |
| [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X) |
| full_o = obj(full_E) |
| print("EPOCHS=" + 0 + " OBJ (FULL DATA): " + full_o) |
| } |
| |
| iter = 0 |
| num_iters_per_epoch = ceil(n / batch_size) |
| max_iterations = max_epochs * num_iters_per_epoch |
| #print("num_iters_per_epoch=" + num_iters_per_epoch + " max_iterations=" + max_iterations) |
| beg = 1 |
| while( iter < max_iterations ){ |
| end = beg + batch_size - 1 |
| if(end > n) end = n |
| X_batch = X[beg:end,] |
| |
| [H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat, Yhat_prime, E] = feedForward(X_batch, W1, b1, W2, b2, W3, b3, W4, b4, X_batch) |
| [W1_grad, b1_grad, W2_grad, b2_grad, W3_grad, b3_grad, W4_grad, b4_grad] = grad(X_batch, H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat_prime, E, W1, W2, W3, W4) |
| |
| o = obj(E) |
| print("epochs=%5.4f BATCH beg=%d end=%d obj=%f", (iter / num_iters_per_epoch), beg, end, o) |
| |
| #update |
| local_step = step / nrow(X_batch) |
| upd_W1 = mu * upd_W1 - local_step * W1_grad |
| upd_b1 = mu * upd_b1 - local_step * b1 |
| upd_W2 = mu * upd_W2 - local_step * W2_grad |
| upd_b2 = mu * upd_b2 - local_step * b2 |
| upd_W3 = mu * upd_W3 - local_step * W3_grad |
| upd_b3 = mu * upd_b3 - local_step * b3 |
| upd_W4 = mu * upd_W4 - local_step * W4_grad |
| upd_b4 = mu * upd_b4 - local_step * b4 |
| W1 = W1 + upd_W1 |
| b1 = b1 + upd_b1 |
| W2 = W2 + upd_W2 |
| b2 = b2 + upd_b2 |
| W3 = W3 + upd_W3 |
| b3 = b3 + upd_b3 |
| W4 = W4 + upd_W4 |
| b4 = b4 + upd_b4 |
| |
| iter = iter + 1 |
| if(end == n) beg = 1 |
| else beg = end + 1 |
| |
| if( iter %% num_iters_per_epoch == 0 ) step = step * decay |
| |
| if( full_obj & iter %% num_iters_per_epoch == 0 ){ |
| [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X) |
| full_o = obj(full_E) |
| epochs = iter %/% num_iters_per_epoch |
| print("EPOCHS=" + epochs + " iter=" + iter + " OBJ (FULL DATA)=" + full_o) |
| } |
| } |
| |
| write(W1, $W1_out, format=fmt) |
| write(b1, $b1_out, format=fmt) |
| write(W2, $W2_out, format=fmt) |
| write(b2, $b2_out, format=fmt) |
| write(W3, $W3_out, format=fmt) |
| write(b3, $b3_out, format=fmt) |
| write(W4, $W4_out, format=fmt) |
| write(b4, $b4_out, format=fmt) |
| |
| if( hfile != " " ){ |
| [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X) |
| reordered_H = t(permut) %*% full_H2 |
| write(reordered_H, hfile, format=fmt) |
| } |