scripts/algorithms/ALS-DS.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT COMPUTES AN APPROXIMATE FACTORIZATIONOF A LOW-RANK MATRIX V INTO TWO MATRICES L AND R
 # USING ALTERNATING-LEAST-SQUARES (ALS) ALGORITHM
 # MATRICES L AND R ARE COMPUTED BY MINIMIZING A LOSS FUNCTION (WITH REGULARIZATION)
 #
 # INPUT   PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME    TYPE     DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # V       String   ---      Location to read the input matrix V to be factorized
 # L       String   ---      Location to write the factor matrix L
 # R       String   ---      Location to write the factor matrix R
 # rank    Int      10       Rank of the factorization
 # reg     String   "L2"	    Regularization:
 #						    "L2" = L2 regularization;
 #                           "wL2" = weighted L2 regularization
 # lambda  Double   0.0      Regularization parameter, no regularization if 0.0
 # maxi    Int      50       Maximum number of iterations
 # check   Boolean  FALSE    Check for convergence after every iteration, i.e., updating L and R once
 # thr     Double   0.0001   Assuming check is set to TRUE, the algorithm stops and convergence is declared
 # 							if the decrease in loss in any two consecutive iterations falls below this threshold;
 #							if check is FALSE thr is ignored
 # fmt     String   "text"   The output format of the factor matrices L and R, such as "text" or "csv"
 # ---------------------------------------------------------------------------------------------
 # OUTPUT:
 # 1- An m x r matrix L, where r is the factorization rank
 # 2- An r x n matrix R
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemML.jar -f ALS.dml -nvargs V=INPUT_DIR/V L=OUTPUT_DIR/L R=OUTPUT_DIR/R rank=10 reg="L2" lambda=0.0001 fmt=csv

 fileV      = $V;
 fileL	   = $L;
 fileR      = $R;

 # Default values of some parameters
 r          = ifdef ($rank, 10);	        # $rank=10;
 reg	   	   = ifdef ($reg, "L2")         # $reg="L2";
 lambda	   = ifdef ($lambda, 0.000001); # $lambda=0.000001;
 max_iter   = ifdef ($maxi, 50);         # $maxi=50;
 check      = ifdef ($check, FALSE);	    # $check=FALSE;
 thr        = ifdef ($thr, 0.0001);      # $thr=0.0001;
 fmtO       = ifdef ($fmt, "text");      # $fmt="text";

 V = read (fileV);


 # check the input matrix V, if some rows or columns contain only zeros remove them from V
 V_nonzero_ind = V != 0;
 row_nonzeros = rowSums (V_nonzero_ind);
 col_nonzeros = t (colSums (V_nonzero_ind));
 orig_nonzero_rows_ind = row_nonzeros != 0;
 orig_nonzero_cols_ind = col_nonzeros != 0;
 num_zero_rows = nrow (V) - sum (orig_nonzero_rows_ind);
 num_zero_cols = ncol (V) - sum (orig_nonzero_cols_ind);
 if (num_zero_rows > 0) {
 	print ("Matrix V contains empty rows! These rows will be removed.");
 	V = removeEmpty (target = V, margin = "rows");
 }
 if (num_zero_cols > 0) {
 	print ("Matrix V contains empty columns! These columns will be removed.");
 	V = removeEmpty (target = V, margin = "cols");
 }
 if (num_zero_rows > 0 | num_zero_cols > 0) {
 	print ("Recomputing nonzero rows and columns!");
 	V_nonzero_ind = V != 0;
 	row_nonzeros = rowSums (V_nonzero_ind);
 	col_nonzeros = t (colSums (V_nonzero_ind));
 }

 ###### MAIN PART ######
 m = nrow (V);
 n = ncol (V);

 # initializing factor matrices
 L = rand (rows = m, cols = r, min = -0.5, max = 0.5);
 R = rand (rows = n, cols = r, min = -0.5, max = 0.5);

 # initializing transformed matrices
 Vt = t(V);

 # check for regularization
 if (reg == "L2") {
 	print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + L2 WITH LAMBDA - " + lambda);
 } else if (reg == "wL2") {
 	print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + WEIGHTED L2 WITH LAMBDA - " + lambda);
 } else {
 	stop ("wrong regularization! " + reg);
 }

 if (check) {
 	loss_init = sum (V_nonzero_ind * (V - (L %*% t(R)))^2) + lambda * (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros));
 	print ("-----   Initial train loss: " + loss_init + " -----");
 }

 lambda_I = diag (matrix (lambda, rows = r, cols = 1));
 it = 0;
 converged = FALSE;
 while ((it < max_iter) & (!converged)) {
 	it = it + 1;
 	# keep R fixed and update L
 	parfor (i in 1:m) {
     	R_nonzero_ind = t(V[i,] != 0);
 		R_nonzero = removeEmpty (target=R * R_nonzero_ind, margin="rows");
 		A1 = (t(R_nonzero) %*% R_nonzero) + (as.scalar(row_nonzeros[i,1]) * lambda_I); # coefficient matrix
 		L[i,] = t(solve (A1, t(V[i,] %*% R)));
 	}

 	# keep L fixed and update R
 	parfor (j in 1:n) {
 		L_nonzero_ind = t(Vt[j,] != 0)
 		L_nonzero = removeEmpty (target=L * L_nonzero_ind, margin="rows");
 		A2 = (t(L_nonzero) %*% L_nonzero) + (as.scalar(col_nonzeros[j,1]) * lambda_I); # coefficient matrix
 		R[j,] = t(solve (A2, t(Vt[j,] %*% L)));
 	}

 	# check for convergence
 	if (check) {
 		loss_cur = sum (V_nonzero_ind * (V - (L %*% t(R)))^2) + lambda * (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros));
 		loss_dec = (loss_init - loss_cur) / loss_init;
 		print ("Train loss at iteration (R) " + it + ": " + loss_cur + " loss-dec " + loss_dec);
 		if (loss_dec >= 0 & loss_dec < thr | loss_init == 0) {
 			print ("----- ALS converged after " + it + " iterations!");
 			converged = TRUE;
 		}
 		loss_init = loss_cur;
 	}
 } # end of while loop

 if (check) {
 	print ("-----	Final train loss: " + loss_init + " -----");
 }

 if (!converged) {
    print ("Max iteration achieved but not converged!");
 }

 # inject 0s in L if original V had empty rows
 if (num_zero_rows > 0) {
 	L = removeEmpty (target = diag (orig_nonzero_rows_ind), margin = "cols") %*% L;
 }
 # inject 0s in R if original V had empty rows
 if (num_zero_cols > 0) {
 	R = removeEmpty (target = diag (orig_nonzero_cols_ind), margin = "cols") %*% R;
 }
 Rt = t (R);
 write (L, fileL, format=fmtO);
 write (Rt, fileR, format=fmtO);
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT COMPUTES AN APPROXIMATE FACTORIZATIONOF A LOW-RANK MATRIX V INTO TWO MATRICES L AND R
	# USING ALTERNATING-LEAST-SQUARES (ALS) ALGORITHM
	# MATRICES L AND R ARE COMPUTED BY MINIMIZING A LOSS FUNCTION (WITH REGULARIZATION)
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# V String --- Location to read the input matrix V to be factorized
	# L String --- Location to write the factor matrix L
	# R String --- Location to write the factor matrix R
	# rank Int 10 Rank of the factorization
	# reg String "L2" Regularization:
	# "L2" = L2 regularization;
	# "wL2" = weighted L2 regularization
	# lambda Double 0.0 Regularization parameter, no regularization if 0.0
	# maxi Int 50 Maximum number of iterations
	# check Boolean FALSE Check for convergence after every iteration, i.e., updating L and R once
	# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
	# if the decrease in loss in any two consecutive iterations falls below this threshold;
	# if check is FALSE thr is ignored
	# fmt String "text" The output format of the factor matrices L and R, such as "text" or "csv"
	# ---------------------------------------------------------------------------------------------
	# OUTPUT:
	# 1- An m x r matrix L, where r is the factorization rank
	# 2- An r x n matrix R
	#
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar SystemML.jar -f ALS.dml -nvargs V=INPUT_DIR/V L=OUTPUT_DIR/L R=OUTPUT_DIR/R rank=10 reg="L2" lambda=0.0001 fmt=csv

	fileV = $V;
	fileL = $L;
	fileR = $R;

	# Default values of some parameters
	r = ifdef ($rank, 10); # $rank=10;
	reg = ifdef ($reg, "L2") # $reg="L2";
	lambda = ifdef ($lambda, 0.000001); # $lambda=0.000001;
	max_iter = ifdef ($maxi, 50); # $maxi=50;
	check = ifdef ($check, FALSE); # $check=FALSE;
	thr = ifdef ($thr, 0.0001); # $thr=0.0001;
	fmtO = ifdef ($fmt, "text"); # $fmt="text";

	V = read (fileV);


	# check the input matrix V, if some rows or columns contain only zeros remove them from V
	V_nonzero_ind = V != 0;
	row_nonzeros = rowSums (V_nonzero_ind);
	col_nonzeros = t (colSums (V_nonzero_ind));
	orig_nonzero_rows_ind = row_nonzeros != 0;
	orig_nonzero_cols_ind = col_nonzeros != 0;
	num_zero_rows = nrow (V) - sum (orig_nonzero_rows_ind);
	num_zero_cols = ncol (V) - sum (orig_nonzero_cols_ind);
	if (num_zero_rows > 0) {
	print ("Matrix V contains empty rows! These rows will be removed.");
	V = removeEmpty (target = V, margin = "rows");
	}
	if (num_zero_cols > 0) {
	print ("Matrix V contains empty columns! These columns will be removed.");
	V = removeEmpty (target = V, margin = "cols");
	}
	if (num_zero_rows > 0 \| num_zero_cols > 0) {
	print ("Recomputing nonzero rows and columns!");
	V_nonzero_ind = V != 0;
	row_nonzeros = rowSums (V_nonzero_ind);
	col_nonzeros = t (colSums (V_nonzero_ind));
	}

	###### MAIN PART ######
	m = nrow (V);
	n = ncol (V);

	# initializing factor matrices
	L = rand (rows = m, cols = r, min = -0.5, max = 0.5);
	R = rand (rows = n, cols = r, min = -0.5, max = 0.5);

	# initializing transformed matrices
	Vt = t(V);

	# check for regularization
	if (reg == "L2") {
	print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + L2 WITH LAMBDA - " + lambda);
	} else if (reg == "wL2") {
	print ("BEGIN ALS SCRIPT WITH NONZERO SQUARED LOSS + WEIGHTED L2 WITH LAMBDA - " + lambda);
	} else {
	stop ("wrong regularization! " + reg);
	}

	if (check) {
	loss_init = sum (V_nonzero_ind * (V - (L %% t(R)))^2) + lambda (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros));
	print ("----- Initial train loss: " + loss_init + " -----");
	}

	lambda_I = diag (matrix (lambda, rows = r, cols = 1));
	it = 0;
	converged = FALSE;
	while ((it < max_iter) & (!converged)) {
	it = it + 1;
	# keep R fixed and update L
	parfor (i in 1:m) {
	R_nonzero_ind = t(V[i,] != 0);
	R_nonzero = removeEmpty (target=R * R_nonzero_ind, margin="rows");
	A1 = (t(R_nonzero) %% R_nonzero) + (as.scalar(row_nonzeros[i,1]) lambda_I); # coefficient matrix
	L[i,] = t(solve (A1, t(V[i,] %*% R)));
	}

	# keep L fixed and update R
	parfor (j in 1:n) {
	L_nonzero_ind = t(Vt[j,] != 0)
	L_nonzero = removeEmpty (target=L * L_nonzero_ind, margin="rows");
	A2 = (t(L_nonzero) %% L_nonzero) + (as.scalar(col_nonzeros[j,1]) lambda_I); # coefficient matrix
	R[j,] = t(solve (A2, t(Vt[j,] %*% L)));
	}

	# check for convergence
	if (check) {
	loss_cur = sum (V_nonzero_ind * (V - (L %% t(R)))^2) + lambda (sum ((L^2) * row_nonzeros) + sum ((R^2) * col_nonzeros));
	loss_dec = (loss_init - loss_cur) / loss_init;
	print ("Train loss at iteration (R) " + it + ": " + loss_cur + " loss-dec " + loss_dec);
	if (loss_dec >= 0 & loss_dec < thr \| loss_init == 0) {
	print ("----- ALS converged after " + it + " iterations!");
	converged = TRUE;
	}
	loss_init = loss_cur;
	}
	} # end of while loop

	if (check) {
	print ("----- Final train loss: " + loss_init + " -----");
	}

	if (!converged) {
	print ("Max iteration achieved but not converged!");
	}

	# inject 0s in L if original V had empty rows
	if (num_zero_rows > 0) {
	L = removeEmpty (target = diag (orig_nonzero_rows_ind), margin = "cols") %*% L;
	}
	# inject 0s in R if original V had empty rows
	if (num_zero_cols > 0) {
	R = removeEmpty (target = diag (orig_nonzero_cols_ind), margin = "cols") %*% R;
	}
	Rt = t (R);
	write (L, fileL, format=fmtO);
	write (Rt, fileR, format=fmtO);