scripts/algorithms/ALS_topk_predict.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT COMPUTES THE TOP-K ITEMS WITH THE HIGHEST PREDICTED RATINGS FOR A GIVEN LIST OF USER-IDS USING 2 FACTOR MATRICES L AND R
 # WE ASSUME THAT ALL USERS HAVE RATED AT LEAST ONCE AND ALL ITEMS HAVE BEEN RATED AT LEAST ONCE.
 # INPUT   PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME    TYPE     DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X       String   ---      Location to read the input user-ids list
 # Y	 	  String   ---	    Location to write the output of top-K prediction:
 #							 - top-K item-ids will be stored at Y
 #							 - the corresponding top-K ratings will be stored at Y+".ratings"
 # L       String   ---      Location of factor matrix L: user-id x feature-id
 # R       String   ---      Location of factor matrix R: feature-id x item-id
 # V	  	  String   ---      Location of original matrix V: user-id x item-id
 # K	  	  Int      5	    The number of top-K items
 # fmt     String   "text"   The output format of the factor matrix user-id/item-id/score
 # ---------------------------------------------------------------------------------------------
 # OUTPUT:
 # 1- A matrix containing the top-K item-ids with highest predicted ratings for the users specified in the input matrix X
 # 2- A matrix containing the top-K predicted ratings for the users specified in the input matrix X
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar system-ds.jar -f ALS-topk-predict.dml -nvargs X=INPUT_DIR/X L=INPUT_DIR/L R=INPUT_DIR/R V=INPUT_DIR/V.mtx
 #													Y=OUTPUT_DIR/Y K=5 fmt=csv

 # ---- named arguments (-nvargs) ----
 fileX = $X;                     # input : list of user-ids (first column is used)
 fileY = $Y;                     # output: top-K item-ids; ratings go to fileY + ".ratings"
 fileL = $L;                     # input : factor matrix L (user-id x feature-id)
 fileR = $R;                     # input : factor matrix R (feature-id x item-id)
 fileV = $V;                     # input : original matrix V (user-id x item-id)
 K     = ifdef ($K, 5);          # requested number of items per user
 fmtO  = ifdef ($fmt, "text");   # format used for both output matrices

 X = read (fileX);
 L = read (fileL);
 R = read (fileR);
 V = read (fileV);

 n     = nrow (X);
 Vrows = nrow (V);
 Vcols = ncol (V);
 Lrows = nrow (L);
 Rcols = ncol (R);

 # K is capped by the number of items whose column in R has at least one non-zero entry
 zero_cols_ind = (colSums (R != 0)) == 0;
 K = min (Vcols - sum (zero_cols_ind), K);

 user_ids   = X[,1];
 X_user_min = min (user_ids);
 X_user_max = max (user_ids);

 # ---- input validation ----
 if (X_user_max > Vrows) {
 	stop ("Predictions cannot be provided. Maximum user-id exceeds the number of rows of V.");
 }
 if (Lrows != Vrows | Rcols != Vcols) {
 	stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V.");
 }
 # user-ids are used below as 1-based column positions of the projection matrix,
 # so an id smaller than 1 has no matching row in L or V
 if (X_user_min < 1) {
 	stop ("Predictions cannot be provided. Minimum user-id is smaller than 1.");
 }


 # creates the n x Lrows projection matrix: row i holds a single 1 in column user_ids[i]
 s = seq (1, n);
 ones = matrix (1, rows = n, cols = 1);
 projection_matrix = table (s, user_ids, ones, n, Lrows);

 # selects the rows of factor L that belong to the requested users
 U_prime = projection_matrix %*% L;

 # predicted ratings of every item for the requested users
 V_filter = U_prime %*% R;

 # selects the requested users from the original V and marks the cells that are 0 there
 V_prime = projection_matrix %*% V;
 V_prime = V_prime == 0;

 # sets the prediction to 0 for every cell that is non-zero in V
 V_filter = V_prime * V_filter;


 # result holders: one row per requested user, one column per rank
 num_users = nrow (V_filter);
 num_items = ncol (V_filter);
 V_top_indices = matrix (0, rows = num_users, cols = K);
 V_top_values  = matrix (0, rows = num_users, cols = K);

 # offset larger than the spread of V_filter: subtracting it from a selected cell
 # moves that cell below every cell that has not been selected yet
 range = max (V_filter) - min (V_filter) + 1;

 # row positions for table(); identical in every iteration, so built once
 row_ids = seq (1, num_users, 1);

 # in round i, takes the current row maximum as the i-th best item and then masks it
 for (i in 1:K) {
 	top_idx = rowIndexMax (V_filter);
 	top_val = rowMaxs (V_filter);
 	V_top_indices[,i] = top_idx;
 	V_top_values[,i] = top_val;
 	V_filter = V_filter - range * table (row_ids, top_idx, num_users, num_items);
 }

 # item-ids whose selected rating is not positive are replaced by 0
 V_top_indices = V_top_indices * (V_top_values > 0);

 # cbind users as a first column
 V_top_indices = cbind (user_ids, V_top_indices);
 V_top_values = cbind (user_ids, V_top_values);

 # writing top K elements
 write (V_top_indices, fileY, format = fmtO);
 write (V_top_values, fileY+".ratings", format = fmtO);
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT COMPUTES THE TOP-K ITEMS WITH THE HIGHEST PREDICTED RATINGS FOR A GIVEN LIST OF USER-IDS USING 2 FACTOR MATRICES L AND R
	# WE ASSUME THAT ALL USERS HAVE RATED AT LEAST ONCE AND ALL ITEMS HAVE BEEN RATED AT LEAST ONCE.
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X String --- Location to read the input user-ids list
	# Y String --- Location to write the output of top-K prediction:
	# - top-K item-ids will be stored at Y
	# - the corresponding top-K ratings will be stored at Y+".ratings"
	# L String --- Location of factor matrix L: user-id x feature-id
	# R String --- Location of factor matrix R: feature-id x item-id
	# V String --- Location of original matrix V: user-id x item-id
	# K Int 5 The number of top-K items
	# fmt String "text" The output format of the factor matrix user-id/item-id/score
	# ---------------------------------------------------------------------------------------------
	# OUTPUT:
	# 1- A matrix containing the top-K item-ids with highest predicted ratings for the users specified in the input matrix X
	# 2- A matrix containing the top-K predicted ratings for the users specified in the input matrix X
	#
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar system-ds.jar -f ALS-topk-predict.dml -nvargs X=INPUT_DIR/X L=INPUT_DIR/L R=INPUT_DIR/R V=INPUT_DIR/V.mtx
	# Y=OUTPUT_DIR/Y K=5 fmt=csv

	# ---- named arguments (-nvargs) ----
	fileX = $X;                     # input : list of user-ids (first column is used)
	fileY = $Y;                     # output: top-K item-ids; ratings go to fileY + ".ratings"
	fileL = $L;                     # input : factor matrix L (user-id x feature-id)
	fileR = $R;                     # input : factor matrix R (feature-id x item-id)
	fileV = $V;                     # input : original matrix V (user-id x item-id)
	K     = ifdef ($K, 5);          # requested number of items per user
	fmtO  = ifdef ($fmt, "text");   # format used for both output matrices

	X = read (fileX);
	L = read (fileL);
	R = read (fileR);
	V = read (fileV);

	n     = nrow (X);
	Vrows = nrow (V);
	Vcols = ncol (V);
	Lrows = nrow (L);
	Rcols = ncol (R);

	# K is capped by the number of items whose column in R has at least one non-zero entry
	zero_cols_ind = (colSums (R != 0)) == 0;
	K = min (Vcols - sum (zero_cols_ind), K);

	user_ids   = X[,1];
	X_user_min = min (user_ids);
	X_user_max = max (user_ids);

	# ---- input validation ----
	if (X_user_max > Vrows) {
		stop ("Predictions cannot be provided. Maximum user-id exceeds the number of rows of V.");
	}
	if (Lrows != Vrows | Rcols != Vcols) {
		stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V.");
	}
	# user-ids are used below as 1-based column positions of the projection matrix,
	# so an id smaller than 1 has no matching row in L or V
	if (X_user_min < 1) {
		stop ("Predictions cannot be provided. Minimum user-id is smaller than 1.");
	}


	# creates the n x Lrows projection matrix: row i holds a single 1 in column user_ids[i]
	s = seq (1, n);
	ones = matrix (1, rows = n, cols = 1);
	projection_matrix = table (s, user_ids, ones, n, Lrows);

	# selects the rows of factor L that belong to the requested users
	U_prime = projection_matrix %*% L;

	# predicted ratings of every item for the requested users
	V_filter = U_prime %*% R;

	# selects the requested users from the original V and marks the cells that are 0 there
	V_prime = projection_matrix %*% V;
	V_prime = V_prime == 0;

	# sets the prediction to 0 for every cell that is non-zero in V
	V_filter = V_prime * V_filter;


	# result holders: one row per requested user, one column per rank
	num_users = nrow (V_filter);
	num_items = ncol (V_filter);
	V_top_indices = matrix (0, rows = num_users, cols = K);
	V_top_values  = matrix (0, rows = num_users, cols = K);

	# offset larger than the spread of V_filter: subtracting it from a selected cell
	# moves that cell below every cell that has not been selected yet
	range = max (V_filter) - min (V_filter) + 1;

	# row positions for table(); identical in every iteration, so built once
	row_ids = seq (1, num_users, 1);

	# in round i, takes the current row maximum as the i-th best item and then masks it
	for (i in 1:K) {
		top_idx = rowIndexMax (V_filter);
		top_val = rowMaxs (V_filter);
		V_top_indices[,i] = top_idx;
		V_top_values[,i] = top_val;
		V_filter = V_filter - range * table (row_ids, top_idx, num_users, num_items);
	}

	# item-ids whose selected rating is not positive are replaced by 0
	V_top_indices = V_top_indices * (V_top_values > 0);

	# cbind users as a first column
	V_top_indices = cbind (user_ids, V_top_indices);
	V_top_values = cbind (user_ids, V_top_values);

	# writing top K elements
	write (V_top_indices, fileY, format = fmtO);
	write (V_top_values, fileY+".ratings", format = fmtO);