#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Neural Collaborative Filtering
#
# Imports
source("nn/optim/adam.dml") as adam
source("nn/layers/relu.dml") as relu
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/affine.dml") as affine
source("nn/layers/log_loss.dml") as log_loss
source("nn/layers/l2_reg.dml") as l2_reg
train = function( matrix[double] users_train,
matrix[double] items_train,
matrix[double] targets_train,
matrix[double] users_val,
matrix[double] items_val,
matrix[double] targets_val,
integer epochs,
integer batch_size,
integer E,
integer D1,
integer D2,
integer D3)
return (List[unknown] biases, List[unknown] weights) {
# Train an NCF model.
#
# Inputs:
#  - users_train: matrix of shape K_train × M with K_train samples of one-hot encoded users
#  - items_train: matrix of shape K_train × N with K_train samples of one-hot encoded items
#  - targets_train: vector with K_train entries, each either 0 or 1, indicating whether the user interacted with the item
#  - users_val: matrix of shape K_val × M with K_val samples of one-hot encoded users
#  - items_val: matrix of shape K_val × N with K_val samples of one-hot encoded items
#  - targets_val: vector with K_val entries, each either 0 or 1, indicating whether the user interacted with the item
#  - epochs: number of training epochs
#  - batch_size: size of the training batches
#  - E: dimension of the embedding layers
#  - D1: dimension of dense layer 1
#  - D2: dimension of dense layer 2
#  - D3: dimension of dense layer 3
#
# Outputs:
#  - biases: list of trained biases
#  - weights: list of trained weights
#
# Network architecture:
#
# +----------------------+ +----------------------+
# |User Embedding [users]| |Item Embedding [items]|
# +----------+-----------+ +---------+------------+
# | |
# | |
# | +-----------+ |
# +------>+Concatenate+<-----+
# +-----+-----+
# |
# v
# +----+----+
# | Dense 1 |
# | (ReLU) |
# +----+----+
# |
# v
# +----+----+
# | Dense 2 |
# | (ReLU) |
# +----+----+
# |
# v
# +----+----+
# | Dense 3 |
# | (ReLU) |
# +----+----+
# |
# v
# +-----+-----+
# |Prediction |
# |(Sigmoid) |
# +-----------+
#
# sanity checks
assert(nrow(items_train) == nrow(users_train));
assert(nrow(users_train) == nrow(targets_train));
assert(nrow(items_val) == nrow(users_val));
assert(nrow(users_val) == nrow(targets_val));
assert(ncol(items_val) == ncol(items_train));
assert(ncol(users_val) == ncol(users_train));
assert(ncol(targets_val) == ncol(targets_train));
assert(ncol(targets_train) == 1);
K_train = nrow(targets_train); # number of training samples
K_val = nrow(targets_val); # number of validation samples
N = ncol(items_train); # number of items
M = ncol(users_train); # number of users
print("NCF training starting with "
+ K_train + " training samples, "
+ K_val + " validation samples, "
+ N + " items and "
+ M + " users...");
# 1. Initialize layers
[W_U, b_U] = affine::init(M, E); # user embedding
[W_I, b_I] = affine::init(N, E); # item embedding
[W_D1, b_D1] = affine::init(2 * E, D1); # dense layer 1
[W_D2, b_D2] = affine::init(D1, D2); # dense layer 2
[W_D3, b_D3] = affine::init(D2, D3); # dense layer 3
[W_F, b_F] = affine::init(D3, 1); # final prediction
# initialize bias and weight lists
biases = list(b_U, b_I, b_D1, b_D2, b_D3, b_F);
weights = list(W_U, W_I, W_D1, W_D2, W_D3, W_F);
# 2. Initialize the Adam optimizer
## Default values for some parameters
lr = 0.001;
beta1 = 0.9; # [0, 1)
beta2 = 0.999; # [0, 1)
epsilon = 0.0000001;
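# Note: beta1 and beta2 above are the standard Adam defaults; epsilon is set
# to 1e-7, slightly larger than the commonly used default of 1e-8.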
t = 0;
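# adam::init allocates zero-initialized first- and second-moment accumulators
# (m, v) with the same shape as the corresponding parameter.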
# (1) user embedding
[mW_U, vW_U] = adam::init(W_U);
[mb_U, vb_U] = adam::init(b_U);
# (1) item embedding
[mW_I, vW_I] = adam::init(W_I);
[mb_I, vb_I] = adam::init(b_I);
# (2) Dense 1
[mW_D1, vW_D1] = adam::init(W_D1);
[mb_D1, vb_D1] = adam::init(b_D1);
# (3) Dense 2
[mW_D2, vW_D2] = adam::init(W_D2);
[mb_D2, vb_D2] = adam::init(b_D2);
# (4) Dense 3
[mW_D3, vW_D3] = adam::init(W_D3);
[mb_D3, vb_D3] = adam::init(b_D3);
# (N) final prediction
[mW_F, vW_F] = adam::init(W_F);
[mb_F, vb_F] = adam::init(b_F);
# Optimize
iters = ceil(K_train / batch_size); # number of batches per epoch
for (e in 1:epochs) {
for (i in 1:iters) {
# Get the next batch (the last batch of an epoch may be smaller)
beg = ((i-1) * batch_size) %% K_train + 1;
end = min(K_train, beg + batch_size - 1);
items_batch = items_train[beg:end,];
users_batch = users_train[beg:end,];
y_batch = targets_train[beg:end,];
# 3. Send inputs through the layers and get activations
[out_FA, out_F, out_D1A, out_D1, out_D2A, out_D2, out_D3A, out_D3, out_concat, out_U, out_I] =
predict(users_batch, items_batch, biases, weights);
# 4. Compute the final error gradient
# log_loss::backward params: (predictions, targets)
dout = log_loss::backward(out_FA, y_batch);
# Compute loss & accuracy for training & validation data in the last iteration
if (i %% 100 == 1) {
# Compute training loss & accuracy
[loss, accuracy] = eval(out_FA, y_batch);
# Compute validation loss & accuracy
# DML multi-return calls must bind all outputs; only out_FA_val is used here
[out_FA_val, tmp_F, tmp_D1A, tmp_D1, tmp_D2A, tmp_D2, tmp_D3A, tmp_D3,
tmp_concat, tmp_U, tmp_I] = predict(users_val, items_val, biases, weights);
[loss_val, accuracy_val] = eval(out_FA_val, targets_val);
# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}
# 5. Backpropagation
# affine::backward params: (upstream gradient, layer input, weights, biases)
dout_FA = sigmoid::backward(dout, out_F);
[dout_F, dW_F, db_F] = affine::backward(dout_FA, out_D3A, W_F, b_F);
dout_D3A = relu::backward(dout_F, out_D3);
[dout_D3, dW_D3, db_D3] = affine::backward(dout_D3A, out_D2A, W_D3, b_D3);
dout_D2A = relu::backward(dout_D3, out_D2);
[dout_D2, dW_D2, db_D2] = affine::backward(dout_D2A, out_D1A, W_D2, b_D2);
dout_D1A = relu::backward(dout_D2, out_D1);
[dout_D1, dW_D1, db_D1] = affine::backward(dout_D1A, out_concat, W_D1, b_D1);
# backprop the concatenation: split the gradient column-wise,
# matching the cbind(out_U, out_I) order used in predict()
dout_U = dout_D1[,1:E];
dout_I = dout_D1[,(E+1):(2*E)];
[dUsers, dW_U, db_U] = affine::backward(dout_U, users_batch, W_U, b_U);
[dItems, dW_I, db_I] = affine::backward(dout_I, items_batch, W_I, b_I);
# 6. Update the timestep (completed optimizer steps so far, starting at 0)
t = (e - 1) * iters + i - 1;
# 7. Call adam::update for all parameters
[b_U, mb_U, vb_U] = adam::update(b_U, db_U, lr, beta1, beta2, epsilon, t, mb_U, vb_U);
[W_U, mW_U, vW_U] = adam::update(W_U, dW_U, lr, beta1, beta2, epsilon, t, mW_U, vW_U);
[b_I, mb_I, vb_I] = adam::update(b_I, db_I, lr, beta1, beta2, epsilon, t, mb_I, vb_I);
[W_I, mW_I, vW_I] = adam::update(W_I, dW_I, lr, beta1, beta2, epsilon, t, mW_I, vW_I);
[b_D1, mb_D1, vb_D1] = adam::update(b_D1, db_D1, lr, beta1, beta2, epsilon, t, mb_D1, vb_D1);
[W_D1, mW_D1, vW_D1] = adam::update(W_D1, dW_D1, lr, beta1, beta2, epsilon, t, mW_D1, vW_D1);
[b_D2, mb_D2, vb_D2] = adam::update(b_D2, db_D2, lr, beta1, beta2, epsilon, t, mb_D2, vb_D2);
[W_D2, mW_D2, vW_D2] = adam::update(W_D2, dW_D2, lr, beta1, beta2, epsilon, t, mW_D2, vW_D2);
[b_D3, mb_D3, vb_D3] = adam::update(b_D3, db_D3, lr, beta1, beta2, epsilon, t, mb_D3, vb_D3);
[W_D3, mW_D3, vW_D3] = adam::update(W_D3, dW_D3, lr, beta1, beta2, epsilon, t, mW_D3, vW_D3);
[b_F, mb_F, vb_F] = adam::update(b_F, db_F, lr, beta1, beta2, epsilon, t, mb_F, vb_F);
[W_F, mW_F, vW_F] = adam::update(W_F, dW_F, lr, beta1, beta2, epsilon, t, mW_F, vW_F);
# 8. Update lists
biases = list(b_U, b_I, b_D1, b_D2, b_D3, b_F);
weights = list(W_U, W_I, W_D1, W_D2, W_D3, W_F);
}
}
print("NCF training completed after " + epochs + " epochs")
}
predict = function(matrix[double] users, matrix[double] items, List[unknown] biases, List[unknown] weights)
return (matrix[double] out_FA, matrix[double] out_F,
matrix[double] out_D1A, matrix[double] out_D1,
matrix[double] out_D2A, matrix[double] out_D2,
matrix[double] out_D3A, matrix[double] out_D3,
matrix[double] out_concat, matrix[double] out_U, matrix[double] out_I) {
#
# Computes the predictions for the given inputs.
#
# Inputs:
# - users : K samples of one-hot encoded users, of shape (K, M).
# - items : K samples of one-hot encoded items, of shape (K, N).
# - biases, weights : lists of trained model parameters
#
# Outputs:
# - out_FA : predicted interaction probabilities, of shape (K, 1).
# - all further outputs : intermediate (pre-)activations, returned so the
#   training loop can reuse them during backpropagation.
#
# parse parameters
b_U = as.matrix(biases[1]);
b_I = as.matrix(biases[2]);
b_D1 = as.matrix(biases[3]);
b_D2 = as.matrix(biases[4]);
b_D3 = as.matrix(biases[5]);
b_F = as.matrix(biases[6]);
W_U = as.matrix(weights[1]);
W_I = as.matrix(weights[2]);
W_D1 = as.matrix(weights[3]);
W_D2 = as.matrix(weights[4]);
W_D3 = as.matrix(weights[5]);
W_F = as.matrix(weights[6]);
# send inputs through layers
# (1) User and item embeddings + concatenation
out_U = affine::forward(users, W_U, b_U);
out_I = affine::forward(items, W_I, b_I);
out_concat = cbind(out_U, out_I); # column-wise concatenation, shape (K, 2*E)
# (2) Dense layers
out_D1 = affine::forward(out_concat, W_D1, b_D1);
out_D1A = relu::forward(out_D1); # ReLU activation
out_D2 = affine::forward(out_D1A, W_D2, b_D2);
out_D2A = relu::forward(out_D2); # ReLU activation
out_D3 = affine::forward(out_D2A, W_D3, b_D3);
out_D3A = relu::forward(out_D3); # ReLU activation
# (N) final prediction
out_F = affine::forward(out_D3A, W_F, b_F);
out_FA = sigmoid::forward(out_F);
}
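# Example scoring call (a sketch; users_new and items_new are assumed one-hot
# matrices of shapes (K, M) and (K, N)). Only out_FA is needed for scoring,
# but all outputs of the multi-return function must be bound:
#
#   [probs, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10] =
#     predict(users_new, items_new, biases, weights);
#   recommendations = probs >= 0.5;
#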
eval = function(matrix[double] probs, matrix[double] y)
return (double loss, double accuracy) {
/*
 * Computes the mean log loss and the classification accuracy
 * (predictions thresholded at 0.5).
 */
# compute the log loss
loss = log_loss::forward(probs, y);
# compute accuracy
Z = probs >= 0.5;
accuracy = 1 - sum(abs(Z - y)) / nrow(y);
}