| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # ALS algorithm using a direct solve method for individual least squares problems. This script |
| # computes an approximate factorization of a low-rank matrix V into two matrices L and R. |
| # Matrices L and R are computed by minimizing a loss function (with regularization). |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # V String --- Location to read the input matrix V to be factorized |
| # L String --- Location to write the factor matrix L |
| # R String --- Location to write the factor matrix R |
| # rank Int 10 Rank of the factorization |
| # reg String "L2" Regularization: |
| # "L2" = L2 regularization; |
| # "wL2" = weighted L2 regularization |
| # lambda Double 0.000001 Regularization parameter, no regularization if 0.0 |
| # maxi Int 50 Maximum number of iterations |
| # check Boolean FALSE Check for convergence after every iteration, i.e., updating L and R once |
| # thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared |
| # if the decrease in loss in any two consecutive iterations falls below this threshold; |
| # if check is FALSE thr is ignored |
| # fmt String "text" The output format of the factor matrices L and R, such as "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # 1- An m x r matrix L, where r is the factorization rank |
| # 2- An r x n matrix R |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f ALS-DS.dml -nvargs V=INPUT_DIR/V L=OUTPUT_DIR/L R=OUTPUT_DIR/R rank=10 reg="L2" lambda=0.0001 fmt=csv |
| |
| fileV = $V; |
| fileL = $L; |
| fileR = $R; |
| |
| # Default values of some parameters |
| r = ifdef($rank, 10); |
| reg = ifdef($reg, "L2"); |
| lambda = ifdef($lambda, 0.000001); |
| max_iter = ifdef($maxi, 50); |
| check = ifdef($check, FALSE); |
| thr = ifdef($thr, 0.0001); |
| fmtO = ifdef($fmt, "text"); |
| |
| V = read (fileV); |
| |
| |
| # check the input matrix V, if some rows or columns contain only zeros remove them from V |
| V_nonzero_ind = V != 0; |
| row_nonzeros = rowSums (V_nonzero_ind); |
| col_nonzeros = t (colSums (V_nonzero_ind)); |
| orig_nonzero_rows_ind = row_nonzeros != 0; |
| orig_nonzero_cols_ind = col_nonzeros != 0; |
| num_zero_rows = nrow (V) - sum (orig_nonzero_rows_ind); |
| num_zero_cols = ncol (V) - sum (orig_nonzero_cols_ind); |
| if (num_zero_rows > 0) { |
| print ("Matrix V contains empty rows! These rows will be removed."); |
| V = removeEmpty (target = V, margin = "rows"); |
| } |
| if (num_zero_cols > 0) { |
| print ("Matrix V contains empty columns! These columns will be removed."); |
| V = removeEmpty (target = V, margin = "cols"); |
| } |
| if (num_zero_rows > 0 | num_zero_cols > 0) { |
| print ("Recomputing nonzero rows and columns!"); |
| V_nonzero_ind = V != 0; |
| row_nonzeros = rowSums (V_nonzero_ind); |
| col_nonzeros = t (colSums (V_nonzero_ind)); |
| } |
| |
| ###### MAIN PART ###### |
| m = nrow (V); |
| n = ncol (V); |
| |
| # initializing factor matrices |
| L = rand (rows = m, cols = r, min = -0.5, max = 0.5); |
| R = rand (rows = n, cols = r, min = -0.5, max = 0.5); |
| |
| # initializing transformed matrices |
| Vt = t(V); |
| |
| # check for regularization |
| if (reg == "L2") { |
| print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + L2 WITH LAMBDA - " + lambda); |
| } else if (reg == "wL2") { |
| print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + WEIGHTED L2 WITH LAMBDA - " + lambda); |
| } else { |
| stop ("wrong regularization! " + reg); |
| } |
| |
| loss_init = 0.0; # only used if check is TRUE |
| if (check) { |
| loss_init = sum (V_nonzero_ind * (V - (L %*% t(R)))^2) + lambda * (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros)); |
| print ("----- Initial train loss: " + loss_init + " -----"); |
| } |
| |
| lambda_I = diag (matrix (lambda, rows = r, cols = 1)); |
| it = 0; |
| converged = FALSE; |
| while ((it < max_iter) & (!converged)) { |
| it = it + 1; |
| # keep R fixed and update L |
| parfor (i in 1:m) { |
| R_nonzero_ind = t(V[i,] != 0); |
| R_nonzero = removeEmpty (target=R * R_nonzero_ind, margin="rows"); |
| A1 = (t(R_nonzero) %*% R_nonzero) + (as.scalar(row_nonzeros[i,1]) * lambda_I); # coefficient matrix |
| L[i,] = t(solve (A1, t(V[i,] %*% R))); |
| } |
| |
| # keep L fixed and update R |
| parfor (j in 1:n) { |
| L_nonzero_ind = t(Vt[j,] != 0) |
| L_nonzero = removeEmpty (target=L * L_nonzero_ind, margin="rows"); |
| A2 = (t(L_nonzero) %*% L_nonzero) + (as.scalar(col_nonzeros[j,1]) * lambda_I); # coefficient matrix |
| R[j,] = t(solve (A2, t(Vt[j,] %*% L))); |
| } |
| |
| # check for convergence |
| if (check) { |
| loss_cur = sum (V_nonzero_ind * (V - (L %*% t(R)))^2) + lambda * (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros)); |
| loss_dec = (loss_init - loss_cur) / loss_init; |
| print ("Train loss at iteration (R) " + it + ": " + loss_cur + " loss-dec " + loss_dec); |
| if (loss_dec >= 0 & loss_dec < thr | loss_init == 0) { |
| print ("----- ALS converged after " + it + " iterations!"); |
| converged = TRUE; |
| } |
| loss_init = loss_cur; |
| } |
| } # end of while loop |
| |
| if (check) { |
| print ("----- Final train loss: " + loss_init + " -----"); |
| } |
| |
| if (!converged) { |
| print ("Max iteration achieved but not converged!"); |
| } |
| |
| # inject 0s in L if original V had empty rows |
| if (num_zero_rows > 0) { |
| L = removeEmpty (target = diag (orig_nonzero_rows_ind), margin = "cols") %*% L; |
| } |
| # inject 0s in R if original V had empty rows |
| if (num_zero_cols > 0) { |
| R = removeEmpty (target = diag (orig_nonzero_cols_ind), margin = "cols") %*% R; |
| } |
| Rt = t (R); |
| write (L, fileL, format=fmtO); |
| write (Rt, fileR, format=fmtO); |
| |