#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Neural Collaborative Filtering
#
# Imports
source("nn/optim/adam.dml") as adam
source("nn/layers/relu.dml") as relu
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/affine.dml") as affine
source("nn/layers/log_loss.dml") as log_loss
source("nn/layers/l2_reg.dml") as l2_reg
train = function( matrix[double] users_train,
matrix[double] items_train,
matrix[double] targets_train,
matrix[double] users_val,
matrix[double] items_val,
matrix[double] targets_val,
integer epochs,
integer batch_size,
integer E,
integer D1,
integer D2,
integer D3)
return (List[unknown] biases, List[unknown] weights) {
# Train an NCF model.
#
# Inputs:
#  - users_train: matrix of shape K_train × M with K_train samples of one-hot encoded users
#  - items_train: matrix of shape K_train × N with K_train samples of one-hot encoded items
#  - targets_train: vector with K_train entries, each either 0 or 1, indicating whether the user interacted with the item
#  - users_val: matrix of shape K_val × M with K_val samples of one-hot encoded users
#  - items_val: matrix of shape K_val × N with K_val samples of one-hot encoded items
#  - targets_val: vector with K_val entries, each either 0 or 1, indicating whether the user interacted with the item
#  - epochs: number of training epochs
#  - batch_size: size of the training batches
#  - E: dimension of the embedding layers
#  - D1: dimension of dense layer 1
#  - D2: dimension of dense layer 2
#  - D3: dimension of dense layer 3
#
# Outputs:
#  - biases: list of trained biases
#  - weights: list of trained weights
#
# Network architecture:
#
# +----------------------+ +----------------------+
# |User Embedding [users]| |Item Embedding [items]|
# +----------+-----------+ +---------+------------+
# | |
# | |
# | +-----------+ |
# +------>+Concatenate+<-----+
# +-----+-----+
# |
# v
# +----+----+
# | Dense 1 |
# | (ReLU) |
# +----+----+
# |
# v
# +----+----+
# | Dense 2 |
# | (ReLU) |
# +----+----+
# |
# v
# +----+----+
# | Dense 3 |
# | (ReLU) |
# +----+----+
# |
# v
# +-----+-----+
# |Prediction |
# |(Sigmoid) |
# +-----------+
#
# sanity checks
assert(nrow(items_train) == nrow(users_train));
assert(nrow(users_train) == nrow(targets_train));
assert(nrow(items_val) == nrow(users_val));
assert(nrow(users_val) == nrow(targets_val));
assert(ncol(items_val) == ncol(items_train));
assert(ncol(users_val) == ncol(users_train));
assert(ncol(targets_val) == ncol(targets_train));
assert(ncol(targets_train) == 1);
K_train = nrow(targets_train); # number of training samples
K_val = nrow(targets_val); # number of validation samples
N = ncol(items_train); # number of items
M = ncol(users_train); # number of users
print("NCF training starting with "
+ K_train + " training samples, "
+ K_val + " validation samples, "
+ N + " items and "
+ M + " users...");
# 1. Initialize layers
[W_U, b_U] = affine::init(M, E); # user embedding
[W_I, b_I] = affine::init(N, E); # item embedding
[W_D1, b_D1] = affine::init(2 * E, D1); # dense layer 1
[W_D2, b_D2] = affine::init(D1, D2); # dense layer 2
[W_D3, b_D3] = affine::init(D2, D3); # dense layer 3
[W_F, b_F] = affine::init(D3, 1); # final prediction
# initialize bias and weight lists
biases = list(b_U, b_I, b_D1, b_D2, b_D3, b_F);
weights = list(W_U, W_I, W_D1, W_D2, W_D3, W_F);
# 2. Initialize the Adam optimizer
## Default values for some parameters
lr = 0.001;
beta1 = 0.9; # [0, 1)
beta2 = 0.999; # [0, 1)
epsilon = 0.0000001;
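# Note: beta1 and beta2 above are the standard Adam defaults; epsilon is set
# to 1e-7, slightly larger than the commonly used default of 1e-8.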
t = 0;
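# adam::init allocates zero-initialized first- and second-moment accumulators
# (m, v) with the same shape as the corresponding parameter.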
# (1) user embedding
[mW_U, vW_U] = adam::init(W_U);
[mb_U, vb_U] = adam::init(b_U);
# (1) item embedding
[mW_I, vW_I] = adam::init(W_I);
[mb_I, vb_I] = adam::init(b_I);
# (2) Dense 1
[mW_D1, vW_D1] = adam::init(W_D1);
[mb_D1, vb_D1] = adam::init(b_D1);
# (3) Dense 2
[mW_D2, vW_D2] = adam::init(W_D2);
[mb_D2, vb_D2] = adam::init(b_D2);
# (4) Dense 3
[mW_D3, vW_D3] = adam::init(W_D3);
[mb_D3, vb_D3] = adam::init(b_D3);
# (N) final prediction
[mW_F, vW_F] = adam::init(W_F);
[mb_F, vb_F] = adam::init(b_F);
# Optimize
iters = ceil(K_train / batch_size); # number of batches per epoch
for (e in 1:epochs) {
for (i in 1:iters) {
# Get the next batch (the last batch of an epoch may be smaller)
beg = ((i-1) * batch_size) %% K_train + 1;
end = min(K_train, beg + batch_size - 1);
items_batch = items_train[beg:end,];
users_batch = users_train[beg:end,];
y_batch = targets_train[beg:end,];
# 3. Send inputs through the layers and get activations
[out_FA, out_F, out_D1A, out_D1, out_D2A, out_D2, out_D3A, out_D3, out_concat, out_U, out_I] =
predict(users_batch, items_batch, biases, weights);
# 4. Compute the final error gradient
# log_loss::backward params: (predictions, targets)
dout = log_loss::backward(out_FA, y_batch);
# Compute loss & accuracy for training & validation data in the last iteration
if (i %% 100 == 1) {
# Compute training loss & accuracy
[loss, accuracy] = eval(out_FA, y_batch);
# Compute validation loss & accuracy
# DML multi-return calls must bind all outputs; only out_FA_val is used here
[out_FA_val, tmp_F, tmp_D1A, tmp_D1, tmp_D2A, tmp_D2, tmp_D3A, tmp_D3,
tmp_concat, tmp_U, tmp_I] = predict(users_val, items_val, biases, weights);
[loss_val, accuracy_val] = eval(out_FA_val, targets_val);
# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}
# 5. Backpropagation
# affine::backward params: (upstream gradient, layer input, weights, biases)
dout_FA = sigmoid::backward(dout, out_F);
[dout_F, dW_F, db_F] = affine::backward(dout_FA, out_D3A, W_F, b_F);
dout_D3A = relu::backward(dout_F, out_D3);
[dout_D3, dW_D3, db_D3] = affine::backward(dout_D3A, out_D2A, W_D3, b_D3);
dout_D2A = relu::backward(dout_D3, out_D2);
[dout_D2, dW_D2, db_D2] = affine::backward(dout_D2A, out_D1A, W_D2, b_D2);
dout_D1A = relu::backward(dout_D2, out_D1);
[dout_D1, dW_D1, db_D1] = affine::backward(dout_D1A, out_concat, W_D1, b_D1);
# backprop the concatenation: split the gradient column-wise,
# matching the cbind(out_U, out_I) order used in predict()
dout_U = dout_D1[,1:E];
dout_I = dout_D1[,(E+1):(2*E)];
[dUsers, dW_U, db_U] = affine::backward(dout_U, users_batch, W_U, b_U);
[dItems, dW_I, db_I] = affine::backward(dout_I, items_batch, W_I, b_I);
# 6. Update the timestep (completed optimizer steps so far, starting at 0)
t = (e - 1) * iters + i - 1;
# 7. Call adam::update for all parameters
[b_U, mb_U, vb_U] = adam::update(b_U, db_U, lr, beta1, beta2, epsilon, t, mb_U, vb_U);
[W_U, mW_U, vW_U] = adam::update(W_U, dW_U, lr, beta1, beta2, epsilon, t, mW_U, vW_U);
[b_I, mb_I, vb_I] = adam::update(b_I, db_I, lr, beta1, beta2, epsilon, t, mb_I, vb_I);
[W_I, mW_I, vW_I] = adam::update(W_I, dW_I, lr, beta1, beta2, epsilon, t, mW_I, vW_I);
[b_D1, mb_D1, vb_D1] = adam::update(b_D1, db_D1, lr, beta1, beta2, epsilon, t, mb_D1, vb_D1);
[W_D1, mW_D1, vW_D1] = adam::update(W_D1, dW_D1, lr, beta1, beta2, epsilon, t, mW_D1, vW_D1);
[b_D2, mb_D2, vb_D2] = adam::update(b_D2, db_D2, lr, beta1, beta2, epsilon, t, mb_D2, vb_D2);
[W_D2, mW_D2, vW_D2] = adam::update(W_D2, dW_D2, lr, beta1, beta2, epsilon, t, mW_D2, vW_D2);
[b_D3, mb_D3, vb_D3] = adam::update(b_D3, db_D3, lr, beta1, beta2, epsilon, t, mb_D3, vb_D3);
[W_D3, mW_D3, vW_D3] = adam::update(W_D3, dW_D3, lr, beta1, beta2, epsilon, t, mW_D3, vW_D3);
[b_F, mb_F, vb_F] = adam::update(b_F, db_F, lr, beta1, beta2, epsilon, t, mb_F, vb_F);
[W_F, mW_F, vW_F] = adam::update(W_F, dW_F, lr, beta1, beta2, epsilon, t, mW_F, vW_F);
# 8. Update lists
biases = list(b_U, b_I, b_D1, b_D2, b_D3, b_F);
weights = list(W_U, W_I, W_D1, W_D2, W_D3, W_F);
}
}
print("NCF training completed after " + epochs + " epochs")
}
predict = function(matrix[double] users, matrix[double] items, List[unknown] biases, List[unknown] weights)
return (matrix[double] out_FA, matrix[double] out_F,
matrix[double] out_D1A, matrix[double] out_D1,
matrix[double] out_D2A, matrix[double] out_D2,
matrix[double] out_D3A, matrix[double] out_D3,
matrix[double] out_concat, matrix[double] out_U, matrix[double] out_I) {
#
# Computes the predictions for the given inputs.
#
# Inputs:
# - users : K samples of one-hot encoded users, of shape (K, M).
# - items : K samples of one-hot encoded items, of shape (K, N).
# - biases, weights : lists of trained model parameters
#
# Outputs:
# - out_FA : predicted interaction probabilities, of shape (K, 1).
# - all further outputs : intermediate (pre-)activations, returned so the
#   training loop can reuse them during backpropagation.
#
# parse parameters
b_U = as.matrix(biases[1]);
b_I = as.matrix(biases[2]);
b_D1 = as.matrix(biases[3]);
b_D2 = as.matrix(biases[4]);
b_D3 = as.matrix(biases[5]);
b_F = as.matrix(biases[6]);
W_U = as.matrix(weights[1]);
W_I = as.matrix(weights[2]);
W_D1 = as.matrix(weights[3]);
W_D2 = as.matrix(weights[4]);
W_D3 = as.matrix(weights[5]);
W_F = as.matrix(weights[6]);
# send inputs through layers
# (1) User and item embeddings + concatenation
out_U = affine::forward(users, W_U, b_U);
out_I = affine::forward(items, W_I, b_I);
out_concat = cbind(out_U, out_I); # column-wise concatenation, shape (K, 2*E)
# (2) Dense layers
out_D1 = affine::forward(out_concat, W_D1, b_D1);
out_D1A = relu::forward(out_D1); # ReLU activation
out_D2 = affine::forward(out_D1A, W_D2, b_D2);
out_D2A = relu::forward(out_D2); # ReLU activation
out_D3 = affine::forward(out_D2A, W_D3, b_D3);
out_D3A = relu::forward(out_D3); # ReLU activation
# (N) final prediction
out_F = affine::forward(out_D3A, W_F, b_F);
out_FA = sigmoid::forward(out_F);
}
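# Example scoring call (a sketch; users_new and items_new are assumed one-hot
# matrices of shapes (K, M) and (K, N)). Only out_FA is needed for scoring,
# but all outputs of the multi-return function must be bound:
#
#   [probs, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10] =
#     predict(users_new, items_new, biases, weights);
#   recommendations = probs >= 0.5;
#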
eval = function(matrix[double] probs, matrix[double] y)
return (double loss, double accuracy) {
/*
 * Computes the mean log loss and the classification accuracy
 * (predictions thresholded at 0.5).
 */
# compute the log loss
loss = log_loss::forward(probs, y);
# compute accuracy
Z = probs >= 0.5;
accuracy = 1 - sum(abs(Z - y)) / nrow(y);
}