blob: e5c85fc07006f53274282388f88e52aa8fe9df79 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT COMPUTES AN APPROXIMATE FACTORIZATIONOF A LOW-RANK MATRIX X INTO TWO MATRICES U AND V
# USING ALTERNATING-LEAST-SQUARES (ALS) ALGORITHM WITH CONJUGATE GRADIENT
# MATRICES U AND V ARE COMPUTED BY MINIMIZING A LOSS FUNCTION (WITH REGULARIZATION)
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the input matrix X to be factorized
# U String --- Location to write the factor matrix U
# V String --- Location to write the factor matrix V
# rank Int 10 Rank of the factorization
# reg String "L2" Regularization:
# "L2" = L2 regularization;
# "wL2" = weighted L2 regularization
# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
# maxi Int 50 Maximum number of iterations
# check Boolean FALSE Check for convergence after every iteration, i.e., updating U and V once
# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
# if the decrease in loss in any two consecutive iterations falls below this threshold;
# if check is FALSE thr is ignored
# fmt String "text" The output format of the factor matrices L and R, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- An m x r matrix U, where r is the factorization rank
# 2- An r x n matrix V
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f ALS-CG.dml -nvargs X=INPUT_DIR/X U=OUTPUT_DIR/U V=OUTPUT_DIR/V rank=10 reg="L2" lambda=0.0001 fmt=csv
fileX = $X;
fileU = $U;
fileV = $V;
# Default values of some parameters
r = ifdef ($rank, 10); # $rank=10;
reg = ifdef ($reg, "L2") # $reg="L2";
lambda = ifdef ($lambda, 0.000001); # $lambda=0.000001;
max_iter = ifdef ($maxi, 50); # $maxi=50;
check = ifdef ($check, TRUE); # $check=FALSE;
thr = ifdef ($thr, 0.0001); # $thr=0.0001;
fmtO = ifdef ($fmt, "text"); # $fmt="text";
###### MAIN PART ######
X = read (fileX);
m = nrow (X);
n = ncol (X);
# initializing factor matrices
U = rand (rows = m, cols = r, min = -0.5, max = 0.5); # mxr
V = rand (rows = n, cols = r, min = -0.5, max = 0.5); # nxr
W = (X != 0);
# check for regularization
if( reg == "L2" ) {
print ("BEGIN ALS-CG SCRIPT WITH NONZERO SQUARED LOSS + L2 WITH LAMBDA - " + lambda);
row_nonzeros = matrix(1, nrow(W), 1);
col_nonzeros = matrix(1, ncol(W), 1);
}
else if( reg == "wL2" ) {
print ("BEGIN ALS-CG SCRIPT WITH NONZERO SQUARED LOSS + WEIGHTED L2 WITH LAMBDA - " + lambda);
row_nonzeros = rowSums(W);
col_nonzeros = t(colSums(W));
}
else {
stop ("wrong regularization! " + reg);
}
# Loss Function with L2:
# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
# + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2))
# Loss Function with weighted L2:
# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
# + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros))
is_U = TRUE; # TRUE = Optimize U, FALSE = Optimize V
maxinneriter = r ; # min (ncol (U), 15);
if( check ) {
loss_init = 0.5 * sum( (X != 0) * (U %*% t(V) - X) ^ 2);
loss_init = loss_init + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros));
print ("----- Initial train loss: " + loss_init + " -----");
}
it = 0;
converged = FALSE;
while( as.integer(it/2) < max_iter & ! converged )
{
it = it + 1;
if( is_U ) {
G = ((X != 0) * (U %*% t(V) - X)) %*% V + lambda * U * row_nonzeros;
}
else {
G = t(t(U) %*% ((X != 0) * (U %*% t(V) - X))) + lambda * V * col_nonzeros;
}
R = -G;
S = R;
norm_G2 = sum (G ^ 2);
norm_R2 = norm_G2;
inneriter = 1;
tt = 0.000000001;
while( norm_R2 > tt * norm_G2 & inneriter <= maxinneriter )
{
if( is_U ) {
HS = (W * (S %*% t(V))) %*% V + lambda * S * row_nonzeros;
alpha = norm_R2 / sum (S * HS);
U = U + alpha * S; # OK since U is not used in HS
}
else {
HS = t(t(U) %*% (W * (U %*% t(S)))) + lambda * S * col_nonzeros;
alpha = norm_R2 / sum (S * HS);
V = V + alpha * S; # OK since V is not used in HS
}
R = R - alpha * HS;
old_norm_R2 = norm_R2;
norm_R2 = sum (R ^ 2);
S = R + (norm_R2 / old_norm_R2) * S;
inneriter = inneriter + 1;
}
is_U = ! is_U;
# check for convergence
if( check & (it%%2 == 0) ) {
loss_cur = 0.5 * sum( (X != 0) * (U %*% t(V) - X) ^ 2);
loss_cur = loss_cur + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros));
loss_dec = (loss_init - loss_cur) / loss_init;
print ("Train loss at iteration (" + as.integer(it/2) + "): " + loss_cur + " loss-dec " + loss_dec);
if( loss_dec >= 0 & loss_dec < thr | loss_init == 0 ) {
print ("----- ALS-CG converged after " + as.integer(it/2) + " iterations!");
converged = TRUE;
}
loss_init = loss_cur;
}
}
if( check ) {
print ("----- Final train loss: " + loss_init + " -----");
}
if( !converged ) {
print ("Max iteration achieved but not converged!");
}
V = t(V);
write (U, fileU, format=fmtO);
write (V, fileV, format=fmtO);