# Factorization Machines for Regression.
# Imports
source("nn/optim/adam.dml") as adam
source("nn/layers/fm.dml") as fm
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/l2_reg.dml") as l2_reg
train = function(matrix[double] X, matrix[double] y, matrix[double] X_val, matrix[double] y_val, int epochs)
    return (matrix[double] w0, matrix[double] W, matrix[double] V) {
  /*
   * Trains the FM model with mini-batch Adam on an L2 loss.
   *
   * Inputs:
   *  - X     : n examples with d features, of shape (n, d)
   *  - y     : Target matrix, of shape (n, 1)
   *  - X_val : Input validation data matrix with d features
   *  - y_val : Target validation matrix, one row per validation example
   *  - epochs: Number of full passes over the training data
   *
   * Outputs:
   *  - w0, W, V : updated model parameters.
   *
   * Network Architecture:
   *
   *  X --> [model] --> out --> l2_loss::backward(out, y) --> dout
   */
  N = nrow(X)  # num examples
  d = ncol(X)  # num features
  k = 2        # factorization dimensionality, only (=2) possible

  # 1. Initialize the fm core
  [w0, W, V] = fm::init(d, k)

  # 2. Initialize the adam optimizer
  ## Default values for some parameters
  lr = 0.001
  beta1 = 0.9      # [0, 1)
  beta2 = 0.999    # [0, 1)
  epsilon = 0.00000001
  # Global Adam timestep: counts the parameter updates performed so far,
  # starting at 0 and growing by one per mini-batch across all epochs.
  t = 0

  [mw0, vw0] = adam::init(w0)
  [mW, vW] = adam::init(W)
  [mV, vV] = adam::init(V)

  # Regularization strength.
  # NOTE(review): lambda only enters the reported training loss below; no
  # regularization gradient is added to dw0/dW/dV -- confirm this is intended.
  lambda = 5e-04

  # Optimize
  print("Starting optimization")
  batch_size = 10
  iters = ceil(N / batch_size)

  for (e in 1:epochs) {
    for (i in 1:iters) {
      # Get the next batch
      beg = ((i-1) * batch_size) %% N + 1
      end = min(N, beg + batch_size - 1)
      X_batch = X[beg:end,]
      y_batch = y[beg:end,]

      # 3. Send inputs through fm::forward
      out = fm::forward(X_batch, w0, W, V)

      # 4. Compute gradients from a loss l2_loss::backward
      dout = l2_loss::backward(out, y_batch)  # (predictions, targets)

      # Compute loss & accuracy for training & validation data every 100 iterations.
      if (i %% 100 == 0) {
        # Compute training loss & accuracy
        [loss_data, accuracy] = eval(out, y_batch)
        loss_reg_w0 = l2_reg::forward(w0, lambda)
        loss_reg_W = l2_reg::forward(W, lambda)
        loss_reg_V = l2_reg::forward(V, lambda)
        loss = loss_data + loss_reg_w0 + loss_reg_W + loss_reg_V

        # Compute validation loss & accuracy
        probs_val = predict(X_val, w0, W, V)
        [loss_val, accuracy_val] = eval(probs_val, y_val)

        # Output results
        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
      }

      # 5. Send the above result through fm::backward
      [dw0, dW, dV] = fm::backward(dout, X_batch, w0, W, V)

      # 6. Call adam::update for all parameters, all with the same timestep
      [w0, mw0, vw0] = adam::update(w0, dw0, lr, beta1, beta2, epsilon, t, mw0, vw0)
      [W, mW, vW] = adam::update(W, dW, lr, beta1, beta2, epsilon, t, mW, vW)
      [V, mV, vV] = adam::update(V, dV, lr, beta1, beta2, epsilon, t, mV, vV)

      # 7. Advance the timestep. The previous formula `e * i - 1` was not a
      # monotonically increasing step count (it repeated and skipped values
      # across epochs), which gives Adam the wrong bias-correction step.
      t = t + 1
    }
  }
}
predict = function(matrix[double] X, matrix[double] w0, matrix[double] W, matrix[double] V)
    return (matrix[double] out) {
  /*
   * Produces model outputs for the given inputs by running the
   * factorization machine forward pass.
   *
   * Inputs:
   *  - X        : n examples with d features, of shape (n, d).
   *  - w0, W, V : trained model parameters, handed unchanged to fm::forward.
   *
   * Outputs:
   *  - out : predicted target vector, i.e. the result of fm::forward.
   */
  # Single forward pass through the fm layer; no other processing is applied.
  out = fm::forward(X, w0, W, V)
}
eval = function(matrix[double] probs, matrix[double] y)
    return (double loss, double accuracy) {
  /*
   * Computes the loss and an error metric for a set of predictions.
   *
   * Inputs:
   *  - probs : model predictions.
   *  - y     : target values, same shape as probs (they are subtracted
   *            element-wise below).
   *
   * Outputs:
   *  - loss     : value returned by l2_loss::forward(probs, y).
   *  - accuracy : square root of the mean squared difference between
   *               probs and y (a root-mean-square error, so lower is better
   *               despite the name).
   */
  # Error metric: root of the mean squared residual
  residual = probs - y
  mse = mean(residual^2)
  accuracy = mse^0.5

  # Loss as defined by the l2_loss layer
  loss = l2_loss::forward(probs, y)
}