scripts/staging/autoencoder-2layer.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Trains a 2-layer autoencoder with minibatch SGD, momentum and step-size decay.
 # If invoked with H1 > H2 then it becomes a 'bowtie' structured autoencoder
 # Weights are initialized using Glorot & Bengio (2010) AISTATS initialization.
 # The script standardizes the input before training (can be turned off).
 # Also, it randomly reshuffles rows before training.
 # Currently, tanh is set to be the activation function.
 # By re-implementing 'func' DML-bodied function, one can change the activation.

 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME   	TYPE     DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X         String   ---      Filename where the input is stored
 # H1        Int      ---      Number of neurons in the 1st hidden layer
 # H2        Int      ---      Number of neurons in the 2nd hidden layer
 # EPOCH     Int      ---      Number of epochs to train for
 # fmt       String   'text'   Output format ("text", "csv", "binary" etc.)
 # OBJ       Boolean  FALSE    If TRUE, Computes objective function value (squared-loss)
 #                             at the end of each epoch. Note that, computing the full
 #                             objective can take a lot of time.
 # BATCH     Int      256      Mini-batch size (training parameter)
 # STEP      Double   1e-5     Initial step size (training parameter)
 # DECAY     Double   0.95     Decays step size after each epoch (training parameter)
 # MOMENTUM  Double   0.9      Momentum parameter (training parameter)
 # ---------------------------------------------------------------------------------------------
 #
 # OUTPUT PARAMETERS (all filenames):
 # ---------------------------------------------------------------------------------------------
 # NAME     TYPE   DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # W1_out   String   ---   File to store weights between input layer and 1st hidden layer
 # b1_out   String   ---   File to store bias between input layer and 1st hidden layer
 # W2_out   String   ---   File to store weights between 1st hidden layer and 2nd hidden layer
 # b2_out   String   ---   File to store bias between 1st hidden layer and 2nd hidden layer
 # W3_out   String   ---   File to store weights between 2nd hidden layer and 3rd hidden layer
 # b3_out   String   ---   File to store bias between 2nd hidden layer and 3rd hidden layer
 # W4_out   String   ---   File to store weights between 3rd hidden layer and output layer
 # b4_out   String   ---   File to store bias between 3rd hidden layer and output layer
 # HIDDEN   String   " "   File to store the hidden (2nd) layer representation if needed
 # ---------------------------------------------------------------------------------------------
 #
 # INVOCATION:
 # -f autoencoder_2layer.dml --nvargs X=<input file> H1=500 H2=2 EPOCH=1 fmt="csv"
 #                                    W1_out=<weights from input to 1st hidden layer> b1_out=<bias from input to 1st hidden layer>
 #									 W2_out=<weights from 1st hidden layer to 2nd hidden layer> b2_out=<bias from 1st hidden layer to 2nd hidden layer>
 #									 W3_out=<weights from 2nd hidden layer to 3rd hidden layer> b3_out=<bias from 2nd hidden layer to 3rd hidden layer>
 #									 W4_out=<weights from 3rd hidden layer to output> b4_out=<bias from 3rd hidden layer to output>
 #

 #implements tanh
 #to use another activation fn, implement a DML-bodied function with
 #function name 'func' and comment out this one
 func = function(Matrix[Double] X) return(Matrix[Double] Y, Matrix[Double] Y_prime){
      Y = (exp(2*X) - 1)/(exp(2*X) + 1)
      Y_prime = 1 - Y^2
 }

 feedForward = function(Matrix[Double] X,
 	               Matrix[Double] W1, Matrix[Double] b1,
 	               Matrix[Double] W2, Matrix[Double] b2,
 	               Matrix[Double] W3, Matrix[Double] b3,
 	               Matrix[Double] W4, Matrix[Double] b4,
 		       Matrix[Double] Y)
       	      return(Matrix[Double] H1, Matrix[Double] H1_prime,
 	     	     Matrix[Double] H2, Matrix[Double] H2_prime,
 	     	     Matrix[Double] H3, Matrix[Double] H3_prime,
 	     	     Matrix[Double] Yhat, Matrix[Double] Yhat_prime,
 	     	     Matrix[Double] E){
       H1_in = t(W1 %*% t(X) + b1)
       [H1, H1_prime] = func(H1_in)

       H2_in = t(W2 %*% t(H1) + b2)
       [H2, H2_prime] = func(H2_in)

       H3_in = t(W3 %*% t(H2) + b3)
       [H3, H3_prime] = func(H3_in)

       Yhat_in = t(W4 %*% t(H3) + b4)
       [Yhat, Yhat_prime] = func(Yhat_in)
       E = Yhat - Y
 }

 grad = function(Matrix[Double] X,
        		Matrix[Double] H1, Matrix[Double] H1_prime,
        		Matrix[Double] H2, Matrix[Double] H2_prime,
        		Matrix[Double] H3, Matrix[Double] H3_prime,
 		Matrix[Double] Yhat_prime,
 		Matrix[Double] E,
 		Matrix[Double] W1, Matrix[Double] W2, Matrix[Double] W3, Matrix[Double] W4)
        return(Matrix[Double] W1_grad, Matrix[Double] b1_grad,
               Matrix[Double] W2_grad, Matrix[Double] b2_grad,
 	      Matrix[Double] W3_grad, Matrix[Double] b3_grad,
 	      Matrix[Double] W4_grad, Matrix[Double] b4_grad){
       #backprop
       delta4 = E * Yhat_prime
       delta3 = H3_prime * (delta4 %*% W4)
       delta2 = H2_prime * (delta3 %*% W3)
       delta1 = H1_prime * (delta2 %*% W2)

       #compute gradients
       b4_grad = t(colSums(delta4))
       b3_grad = t(colSums(delta3))
       b2_grad = t(colSums(delta2))
       b1_grad = t(colSums(delta1))

       W4_grad = t(delta4) %*% H3
       W3_grad = t(delta3) %*% H2
       W2_grad = t(delta2) %*% H1
       W1_grad = t(delta1) %*% X
 }

 obj = function(Matrix[Double] E) return(Double val){
     val = 0.5 * sum(E^2)
 }

 batch_size = ifdef($BATCH, 256)
 mu = ifdef($MOMENTUM, 0.9)
 step = ifdef($STEP, 1e-5)
 decay = ifdef($DECAY, 0.95)
 hfile = ifdef($HIDDEN, " ")
 fmt = ifdef($fmt, "text")
 full_obj = ifdef($OBJ, FALSE)

 X = read($X)
 num_hidden1 = $H1
 num_hidden2 = $H2
 max_epochs = $EPOCH

 n = nrow(X)
 m = ncol(X)

 #randomly reordering rows
 permut = table(seq(1,n,1), order(target=Rand(rows=n, cols=1, min=0, max=1, pdf="uniform"), by=1, index.return=TRUE), n, n)
 X = permut %*% X

 #z-transform, whitening operator is better
 means = colSums(X)/n
 stds = sqrt((colSums(X^2)/n - means*means)*n/(n-1)) + 1e-17
 X = (X - means)/stds

 W1 = sqrt(6)/sqrt(m + num_hidden1) * Rand(rows=num_hidden1, cols=m, min=-1, max=1, pdf="uniform")
 b1 = matrix(0, rows=num_hidden1, cols=1)
 W2 = sqrt(6)/sqrt(num_hidden1 + num_hidden2) * Rand(rows=num_hidden2, cols=num_hidden1, min=-1, max=1, pdf="uniform")
 b2 = matrix(0, rows=num_hidden2, cols=1)
 W3 = sqrt(6)/sqrt(num_hidden2 + num_hidden1) * Rand(rows=num_hidden1, cols=num_hidden2, min=-1, max=1, pdf="uniform")
 b3 = matrix(0, rows=num_hidden1, cols=1)
 W4 = sqrt(6)/sqrt(num_hidden2 + m) * Rand(rows=m, cols=num_hidden1, min=-1, max=1, pdf="uniform")
 b4 = matrix(0, rows=m, cols=1)

 upd_W1 = matrix(0, rows=nrow(W1), cols=ncol(W1))
 upd_b1 = matrix(0, rows=nrow(b1), cols=ncol(b1))
 upd_W2 = matrix(0, rows=nrow(W2), cols=ncol(W2))
 upd_b2 = matrix(0, rows=nrow(b2), cols=ncol(b2))
 upd_W3 = matrix(0, rows=nrow(W3), cols=ncol(W3))
 upd_b3 = matrix(0, rows=nrow(b3), cols=ncol(b3))
 upd_W4 = matrix(0, rows=nrow(W4), cols=ncol(W4))
 upd_b4 = matrix(0, rows=nrow(b4), cols=ncol(b4))

 if( full_obj ){
       [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
       full_o = obj(full_E)
       print("EPOCHS=" + 0 + " OBJ (FULL DATA): " + full_o)
 }

 iter = 0
 num_iters_per_epoch = ceil(n / batch_size)
 max_iterations = max_epochs * num_iters_per_epoch
 #print("num_iters_per_epoch=" + num_iters_per_epoch + " max_iterations=" + max_iterations)
 beg = 1
 while( iter < max_iterations ){
       end = beg + batch_size - 1
       if(end > n) end = n
       X_batch = X[beg:end,]

       [H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat, Yhat_prime, E] = feedForward(X_batch, W1, b1, W2, b2, W3, b3, W4, b4, X_batch)
       [W1_grad, b1_grad, W2_grad, b2_grad, W3_grad, b3_grad, W4_grad, b4_grad] = grad(X_batch, H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat_prime, E, W1, W2, W3, W4)

       o = obj(E)
       print("epochs=%5.4f BATCH beg=%d end=%d obj=%f", (iter / num_iters_per_epoch), beg, end, o)

       #update
       local_step = step / nrow(X_batch)
       upd_W1 = mu * upd_W1 - local_step * W1_grad
       upd_b1 = mu * upd_b1 - local_step * b1
       upd_W2 = mu * upd_W2 - local_step * W2_grad
       upd_b2 = mu * upd_b2 - local_step * b2
       upd_W3 = mu * upd_W3 - local_step * W3_grad
       upd_b3 = mu * upd_b3 - local_step * b3
       upd_W4 = mu * upd_W4 - local_step * W4_grad
       upd_b4 = mu * upd_b4 - local_step * b4
       W1 = W1 + upd_W1
       b1 = b1 + upd_b1
       W2 = W2 + upd_W2
       b2 = b2 + upd_b2
       W3 = W3 + upd_W3
       b3 = b3 + upd_b3
       W4 = W4 + upd_W4
       b4 = b4 + upd_b4

       iter = iter + 1
       if(end == n) beg = 1
       else beg = end + 1

       if( iter %% num_iters_per_epoch == 0 ) step = step * decay

       if( full_obj & iter %% num_iters_per_epoch == 0 ){
       	  [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
           full_o = obj(full_E)
       	  epochs = iter %/% num_iters_per_epoch
           print("EPOCHS=" + epochs + " iter=" + iter + " OBJ (FULL DATA)=" + full_o)
       }
 }

 write(W1, $W1_out, format=fmt)
 write(b1, $b1_out, format=fmt)
 write(W2, $W2_out, format=fmt)
 write(b2, $b2_out, format=fmt)
 write(W3, $W3_out, format=fmt)
 write(b3, $b3_out, format=fmt)
 write(W4, $W4_out, format=fmt)
 write(b4, $b4_out, format=fmt)

 if( hfile != " " ){
       [full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
       reordered_H = t(permut) %*% full_H2
       write(reordered_H, hfile, format=fmt)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Trains a 2-layer autoencoder with minibatch SGD, momentum and step-size decay.
	# If invoked with H1 > H2 then it becomes a 'bowtie' structured autoencoder
	# Weights are initialized using Glorot & Bengio (2010) AISTATS initialization.
	# The script standardizes the input before training (can be turned off).
	# Also, it randomly reshuffles rows before training.
	# Currently, tanh is set to be the activation function.
	# By re-implementing 'func' DML-bodied function, one can change the activation.

	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X String --- Filename where the input is stored
	# H1 Int --- Number of neurons in the 1st hidden layer
	# H2 Int --- Number of neurons in the 2nd hidden layer
	# EPOCH Int --- Number of epochs to train for
	# fmt String 'text' Output format ("text", "csv", "binary" etc.)
	# OBJ Boolean FALSE If TRUE, Computes objective function value (squared-loss)
	# at the end of each epoch. Note that, computing the full
	# objective can take a lot of time.
	# BATCH Int 256 Mini-batch size (training parameter)
	# STEP Double 1e-5 Initial step size (training parameter)
	# DECAY Double 0.95 Decays step size after each epoch (training parameter)
	# MOMENTUM Double 0.9 Momentum parameter (training parameter)
	# ---------------------------------------------------------------------------------------------
	#
	# OUTPUT PARAMETERS (all filenames):
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# W1_out String --- File to store weights between input layer and 1st hidden layer
	# b1_out String --- File to store bias between input layer and 1st hidden layer
	# W2_out String --- File to store weights between 1st hidden layer and 2nd hidden layer
	# b2_out String --- File to store bias between 1st hidden layer and 2nd hidden layer
	# W3_out String --- File to store weights between 2nd hidden layer and 3rd hidden layer
	# b3_out String --- File to store bias between 2nd hidden layer and 3rd hidden layer
	# W4_out String --- File to store weights between 3rd hidden layer and output layer
	# b4_out String --- File to store bias between 3rd hidden layer and output layer
	# HIDDEN String " " File to store the hidden (2nd) layer representation if needed
	# ---------------------------------------------------------------------------------------------
	#
	# INVOCATION:
	# -f autoencoder_2layer.dml --nvargs X=<input file> H1=500 H2=2 EPOCH=1 fmt="csv"
	# W1_out=<weights from input to 1st hidden layer> b1_out=<bias from input to 1st hidden layer>
	# W2_out=<weights from 1st hidden layer to 2nd hidden layer> b2_out=<bias from 1st hidden layer to 2nd hidden layer>
	# W3_out=<weights from 2nd hidden layer to 3rd hidden layer> b3_out=<bias from 2nd hidden layer to 3rd hidden layer>
	# W4_out=<weights from 3rd hidden layer to output> b4_out=<bias from 3rd hidden layer to output>
	#

	#implements tanh
	#to use another activation fn, implement a DML-bodied function with
	#function name 'func' and comment out this one
	func = function(Matrix[Double] X) return(Matrix[Double] Y, Matrix[Double] Y_prime){
	Y = (exp(2X) - 1)/(exp(2X) + 1)
	Y_prime = 1 - Y^2
	}

	feedForward = function(Matrix[Double] X,
	Matrix[Double] W1, Matrix[Double] b1,
	Matrix[Double] W2, Matrix[Double] b2,
	Matrix[Double] W3, Matrix[Double] b3,
	Matrix[Double] W4, Matrix[Double] b4,
	Matrix[Double] Y)
	return(Matrix[Double] H1, Matrix[Double] H1_prime,
	Matrix[Double] H2, Matrix[Double] H2_prime,
	Matrix[Double] H3, Matrix[Double] H3_prime,
	Matrix[Double] Yhat, Matrix[Double] Yhat_prime,
	Matrix[Double] E){
	H1_in = t(W1 %*% t(X) + b1)
	[H1, H1_prime] = func(H1_in)

	H2_in = t(W2 %*% t(H1) + b2)
	[H2, H2_prime] = func(H2_in)

	H3_in = t(W3 %*% t(H2) + b3)
	[H3, H3_prime] = func(H3_in)

	Yhat_in = t(W4 %*% t(H3) + b4)
	[Yhat, Yhat_prime] = func(Yhat_in)
	E = Yhat - Y
	}

	grad = function(Matrix[Double] X,
	Matrix[Double] H1, Matrix[Double] H1_prime,
	Matrix[Double] H2, Matrix[Double] H2_prime,
	Matrix[Double] H3, Matrix[Double] H3_prime,
	Matrix[Double] Yhat_prime,
	Matrix[Double] E,
	Matrix[Double] W1, Matrix[Double] W2, Matrix[Double] W3, Matrix[Double] W4)
	return(Matrix[Double] W1_grad, Matrix[Double] b1_grad,
	Matrix[Double] W2_grad, Matrix[Double] b2_grad,
	Matrix[Double] W3_grad, Matrix[Double] b3_grad,
	Matrix[Double] W4_grad, Matrix[Double] b4_grad){
	#backprop
	delta4 = E * Yhat_prime
	delta3 = H3_prime * (delta4 %*% W4)
	delta2 = H2_prime * (delta3 %*% W3)
	delta1 = H1_prime * (delta2 %*% W2)

	#compute gradients
	b4_grad = t(colSums(delta4))
	b3_grad = t(colSums(delta3))
	b2_grad = t(colSums(delta2))
	b1_grad = t(colSums(delta1))

	W4_grad = t(delta4) %*% H3
	W3_grad = t(delta3) %*% H2
	W2_grad = t(delta2) %*% H1
	W1_grad = t(delta1) %*% X
	}

	obj = function(Matrix[Double] E) return(Double val){
	val = 0.5 * sum(E^2)
	}

	batch_size = ifdef($BATCH, 256)
	mu = ifdef($MOMENTUM, 0.9)
	step = ifdef($STEP, 1e-5)
	decay = ifdef($DECAY, 0.95)
	hfile = ifdef($HIDDEN, " ")
	fmt = ifdef($fmt, "text")
	full_obj = ifdef($OBJ, FALSE)

	X = read($X)
	num_hidden1 = $H1
	num_hidden2 = $H2
	max_epochs = $EPOCH

	n = nrow(X)
	m = ncol(X)

	#randomly reordering rows
	permut = table(seq(1,n,1), order(target=Rand(rows=n, cols=1, min=0, max=1, pdf="uniform"), by=1, index.return=TRUE), n, n)
	X = permut %*% X

	#z-transform, whitening operator is better
	means = colSums(X)/n
	stds = sqrt((colSums(X^2)/n - meansmeans)n/(n-1)) + 1e-17
	X = (X - means)/stds

	W1 = sqrt(6)/sqrt(m + num_hidden1) * Rand(rows=num_hidden1, cols=m, min=-1, max=1, pdf="uniform")
	b1 = matrix(0, rows=num_hidden1, cols=1)
	W2 = sqrt(6)/sqrt(num_hidden1 + num_hidden2) * Rand(rows=num_hidden2, cols=num_hidden1, min=-1, max=1, pdf="uniform")
	b2 = matrix(0, rows=num_hidden2, cols=1)
	W3 = sqrt(6)/sqrt(num_hidden2 + num_hidden1) * Rand(rows=num_hidden1, cols=num_hidden2, min=-1, max=1, pdf="uniform")
	b3 = matrix(0, rows=num_hidden1, cols=1)
	W4 = sqrt(6)/sqrt(num_hidden2 + m) * Rand(rows=m, cols=num_hidden1, min=-1, max=1, pdf="uniform")
	b4 = matrix(0, rows=m, cols=1)

	upd_W1 = matrix(0, rows=nrow(W1), cols=ncol(W1))
	upd_b1 = matrix(0, rows=nrow(b1), cols=ncol(b1))
	upd_W2 = matrix(0, rows=nrow(W2), cols=ncol(W2))
	upd_b2 = matrix(0, rows=nrow(b2), cols=ncol(b2))
	upd_W3 = matrix(0, rows=nrow(W3), cols=ncol(W3))
	upd_b3 = matrix(0, rows=nrow(b3), cols=ncol(b3))
	upd_W4 = matrix(0, rows=nrow(W4), cols=ncol(W4))
	upd_b4 = matrix(0, rows=nrow(b4), cols=ncol(b4))

	if( full_obj ){
	[full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
	full_o = obj(full_E)
	print("EPOCHS=" + 0 + " OBJ (FULL DATA): " + full_o)
	}

	iter = 0
	num_iters_per_epoch = ceil(n / batch_size)
	max_iterations = max_epochs * num_iters_per_epoch
	#print("num_iters_per_epoch=" + num_iters_per_epoch + " max_iterations=" + max_iterations)
	beg = 1
	while( iter < max_iterations ){
	end = beg + batch_size - 1
	if(end > n) end = n
	X_batch = X[beg:end,]

	[H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat, Yhat_prime, E] = feedForward(X_batch, W1, b1, W2, b2, W3, b3, W4, b4, X_batch)
	[W1_grad, b1_grad, W2_grad, b2_grad, W3_grad, b3_grad, W4_grad, b4_grad] = grad(X_batch, H1, H1_prime, H2, H2_prime, H3, H3_prime, Yhat_prime, E, W1, W2, W3, W4)

	o = obj(E)
	print("epochs=%5.4f BATCH beg=%d end=%d obj=%f", (iter / num_iters_per_epoch), beg, end, o)

	#update
	local_step = step / nrow(X_batch)
	upd_W1 = mu * upd_W1 - local_step * W1_grad
	upd_b1 = mu * upd_b1 - local_step * b1
	upd_W2 = mu * upd_W2 - local_step * W2_grad
	upd_b2 = mu * upd_b2 - local_step * b2
	upd_W3 = mu * upd_W3 - local_step * W3_grad
	upd_b3 = mu * upd_b3 - local_step * b3
	upd_W4 = mu * upd_W4 - local_step * W4_grad
	upd_b4 = mu * upd_b4 - local_step * b4
	W1 = W1 + upd_W1
	b1 = b1 + upd_b1
	W2 = W2 + upd_W2
	b2 = b2 + upd_b2
	W3 = W3 + upd_W3
	b3 = b3 + upd_b3
	W4 = W4 + upd_W4
	b4 = b4 + upd_b4

	iter = iter + 1
	if(end == n) beg = 1
	else beg = end + 1

	if( iter %% num_iters_per_epoch == 0 ) step = step * decay

	if( full_obj & iter %% num_iters_per_epoch == 0 ){
	[full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
	full_o = obj(full_E)
	epochs = iter %/% num_iters_per_epoch
	print("EPOCHS=" + epochs + " iter=" + iter + " OBJ (FULL DATA)=" + full_o)
	}
	}

	write(W1, $W1_out, format=fmt)
	write(b1, $b1_out, format=fmt)
	write(W2, $W2_out, format=fmt)
	write(b2, $b2_out, format=fmt)
	write(W3, $W3_out, format=fmt)
	write(b3, $b3_out, format=fmt)
	write(W4, $W4_out, format=fmt)
	write(b4, $b4_out, format=fmt)

	if( hfile != " " ){
	[full_H1, full_H1_prime, full_H2, full_H2_prime, full_H3, full_H3_prime, full_Yhat, full_Yhat_prime, full_E] = feedForward(X, W1, b1, W2, b2, W3, b3, W4, b4, X)
	reordered_H = t(permut) %*% full_H2
	write(reordered_H, hfile, format=fmt)
	}