| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT COMPUTES AN APPROXIMATE FACTORIZATION OF A LOW-RANK MATRIX X INTO TWO MATRICES U AND V |
| # USING THE ALTERNATING-LEAST-SQUARES (ALS) ALGORITHM WITH CONJUGATE GRADIENT. |
| # MATRICES U AND V ARE COMPUTED BY MINIMIZING A LOSS FUNCTION (WITH REGULARIZATION). |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read the input matrix X to be factorized |
| # U String --- Location to write the factor matrix U |
| # V String --- Location to write the factor matrix V |
| # rank Int 10 Rank of the factorization |
| # reg String "L2" Regularization: |
| # "L2" = L2 regularization; |
| # "wL2" = weighted L2 regularization |
| # lambda Double 0.000001 Regularization parameter, no regularization if 0.0 |
| # maxi Int 50 Maximum number of iterations |
| # check Boolean TRUE Check for convergence after every iteration, i.e., updating U and V once |
| # thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared |
| # if the decrease in loss in any two consecutive iterations falls below this threshold; |
| # if check is FALSE thr is ignored |
| # fmt String "text" The output format of the factor matrices L and R, such as "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # 1- An m x r matrix U, where r is the factorization rank |
| # 2- An r x n matrix V |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f ALS-CG.dml -nvargs X=INPUT_DIR/X U=OUTPUT_DIR/U V=OUTPUT_DIR/V rank=10 reg="L2" lambda=0.0001 fmt=csv |
| |
| fileX = $X; |
| fileU = $U; |
| fileV = $V; |
| |
| # Default values of some parameters |
| r = ifdef($rank, 10); |
| reg = ifdef($reg, "L2"); |
| lambda = ifdef($lambda, 0.000001); |
| max_iter = ifdef($maxi, 50); |
| check = ifdef($check, TRUE); |
| thr = ifdef($thr, 0.0001); |
| fmtO = ifdef($fmt, "text"); |
| |
| |
| ###### MAIN PART ###### |
| X = read (fileX); |
| m = nrow (X); |
| n = ncol (X); |
| |
| # initializing factor matrices |
| U = rand (rows = m, cols = r, min = -0.5, max = 0.5); # mxr |
| V = rand (rows = n, cols = r, min = -0.5, max = 0.5); # nxr |
| |
| W = (X != 0); |
| |
| # check for regularization |
| row_nonzeros = matrix(0,rows=1,cols=1); |
| col_nonzeros = matrix(0,rows=1,cols=1); |
| if( reg == "L2" ) { |
| print ("BEGIN ALS-CG SCRIPT WITH NONZERO SQUARED LOSS + L2 WITH LAMBDA - " + lambda); |
| row_nonzeros = matrix(1, nrow(W), 1); |
| col_nonzeros = matrix(1, ncol(W), 1); |
| } |
| else if( reg == "wL2" ) { |
| print ("BEGIN ALS-CG SCRIPT WITH NONZERO SQUARED LOSS + WEIGHTED L2 WITH LAMBDA - " + lambda); |
| row_nonzeros = rowSums(W); |
| col_nonzeros = t(colSums(W)); |
| } |
| else { |
| stop ("wrong regularization! " + reg); |
| } |
| |
| # Loss Function with L2: |
| # f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2) |
| # + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2)) |
| # Loss Function with weighted L2: |
| # f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2) |
| # + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros)) |
| |
| is_U = TRUE; # TRUE = Optimize U, FALSE = Optimize V |
| maxinneriter = r ; # min (ncol (U), 15); |
| |
| loss_init = 0.0; # only used if check is TRUE |
| if( check ) { |
| loss_init = 0.5 * sum( (X != 0) * (U %*% t(V) - X) ^ 2); |
| loss_init = loss_init + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros)); |
| print ("----- Initial train loss: " + loss_init + " -----"); |
| } |
| |
| it = 0; |
| converged = FALSE; |
| while( as.integer(it/2) < max_iter & ! converged ) |
| { |
| it = it + 1; |
| if( is_U ) { |
| G = ((X != 0) * (U %*% t(V) - X)) %*% V + lambda * U * row_nonzeros; |
| } |
| else { |
| G = t(t(U) %*% ((X != 0) * (U %*% t(V) - X))) + lambda * V * col_nonzeros; |
| } |
| |
| R = -G; |
| S = R; |
| norm_G2 = sum (G ^ 2); |
| norm_R2 = norm_G2; |
| |
| inneriter = 1; |
| tt = 0.000000001; |
| while( norm_R2 > tt * norm_G2 & inneriter <= maxinneriter ) |
| { |
| if( is_U ) { |
| HS = (W * (S %*% t(V))) %*% V + lambda * S * row_nonzeros; |
| alpha = norm_R2 / sum (S * HS); |
| U = U + alpha * S; # OK since U is not used in HS |
| } |
| else { |
| HS = t(t(U) %*% (W * (U %*% t(S)))) + lambda * S * col_nonzeros; |
| alpha = norm_R2 / sum (S * HS); |
| V = V + alpha * S; # OK since V is not used in HS |
| } |
| |
| R = R - alpha * HS; |
| old_norm_R2 = norm_R2; |
| norm_R2 = sum (R ^ 2); |
| S = R + (norm_R2 / old_norm_R2) * S; |
| inneriter = inneriter + 1; |
| } |
| |
| is_U = ! is_U; |
| |
| # check for convergence |
| if( check & (it%%2 == 0) ) { |
| loss_cur = 0.5 * sum( (X != 0) * (U %*% t(V) - X) ^ 2); |
| loss_cur = loss_cur + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros) + sum (V ^ 2 * col_nonzeros)); |
| |
| loss_dec = (loss_init - loss_cur) / loss_init; |
| print ("Train loss at iteration (" + as.integer(it/2) + "): " + loss_cur + " loss-dec " + loss_dec); |
| if( loss_dec >= 0 & loss_dec < thr | loss_init == 0 ) { |
| print ("----- ALS-CG converged after " + as.integer(it/2) + " iterations!"); |
| converged = TRUE; |
| } |
| loss_init = loss_cur; |
| } |
| } |
| |
| if( check ) { |
| print ("----- Final train loss: " + loss_init + " -----"); |
| } |
| |
| if( !converged ) { |
| print ("Max iteration achieved but not converged!"); |
| } |
| |
| V = t(V); |
| write (U, fileU, format=fmtO); |
| write (V, fileV, format=fmtO); |