| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT COMPUTES THE TOP-K ITEMS WITH THE HIGHEST PREDICTED RATINGS FOR A GIVEN LIST OF USER-IDS USING 2 FACTOR MATRICES L AND R |
| # WE ASSUME THAT ALL USERS HAVE RATED AT LEAST ONCE AND ALL ITEMS HAVE BEEN RATED AT LEAST ONCE. |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read the input user-ids list |
| # Y String --- Location to write the output of top-K prediction: |
| # - top-K item-ids will be stored at Y |
| # - the corresponding top-K ratings will be stored at Y+".ratings" |
| # L String --- Location of factor matrix L: user-id x feature-id |
| # R String --- Location of factor matrix R: feature-id x item-id |
| # V String --- Location of original matrix V: user-id x item-id |
| # K Int 5 The number of top-K items |
| # fmt String "text" The output format of the top-K item-id matrix and the top-K rating matrix |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # 1- A matrix containing the top-K item-ids with highest predicted ratings for the users specified in the input matrix X |
| # 2- A matrix containing the top-K predicted ratings for the users specified in the input matrix X |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar system-ds.jar -f ALS-topk-predict.dml -nvargs X=INPUT_DIR/X L=INPUT_DIR/L R=INPUT_DIR/R V=INPUT_DIR/V.mtx |
| # Y=OUTPUT_DIR/Y K=5 fmt=csv |
| |
| # ---- command-line arguments and input matrices ---- |
| |
| # X: list of user-ids to produce predictions for |
| fileX = $X; |
| X = read (fileX); |
| |
| # L: factor matrix, user-id x feature-id |
| fileL = $L; |
| L = read (fileL); |
| |
| # R: factor matrix, feature-id x item-id |
| fileR = $R; |
| R = read (fileR); |
| |
| # V: original matrix, user-id x item-id |
| fileV = $V; |
| V = read (fileV); |
| |
| # Y: output location (the ratings are written next to it with a ".ratings" suffix) |
| fileY = $Y; |
| |
| # optional arguments: number of items per user (default 5) and output format (default "text") |
| K = ifdef ($K, 5); |
| fmtO = ifdef ($fmt, "text"); |
| |
| # ---- input validation and effective K ---- |
| |
| # dimensions of the original matrix V (user-id x item-id) |
| Vrows = nrow(V); |
| Vcols = ncol(V); |
| |
| # items whose column in R is entirely zero cannot be ranked; cap K to the number of remaining items |
| zero_cols_ind = (colSums (R != 0)) == 0; |
| K = min (Vcols - sum (zero_cols_ind), K); |
| |
| # number of requested users |
| n = nrow(X); |
| |
| # dimensions of the factors that must agree with V |
| Lrows = nrow(L); |
| Rcols = ncol(R); |
| |
| # smallest and largest requested user-id |
| X_user_max = max(X[,1]); |
| X_user_min = min(X[,1]); |
| |
| if (X_user_max > Vrows) { |
|   stop ("Predictions cannot be provided. Maximum user-id exceeds the number of rows of V."); |
| } |
| if (Lrows != Vrows | Rcols != Vcols) { |
|   stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V."); |
| } |
| # user-ids are later used as column positions when the projection matrix is built, |
| # so they have to be valid 1-based row numbers of L and V |
| if (X_user_min < 1) { |
|   stop ("Predictions cannot be provided. Minimum user-id is smaller than 1."); |
| } |
| # K is used both as the number of output columns and as the upper bound of the |
| # ranking loop (1:K); a value below 1 leaves nothing to compute |
| if (K < 1) { |
|   stop ("Predictions cannot be provided. K must be at least 1 and R must contain at least one non-zero column."); |
| } |
| |
| |
| # ---- restrict L and V to the requested users and predict their ratings ---- |
| |
| # n x Lrows selector: row i carries weight 1 in the column given by the i-th user-id of X |
| request_rows = seq(1, n); |
| unit_weights = matrix (1, rows = n, cols = 1); |
| user_selector = table(request_rows, X[,1], unit_weights, n, Lrows); |
| |
| # rows of L that belong to the requested users |
| L_selected = user_selector %*% L; |
| |
| # predicted ratings of the requested users for every item |
| V_filter = L_selected %*% R; |
| |
| # rows of the original V that belong to the requested users |
| V_selected = user_selector %*% V; |
| |
| # 1 where V holds no entry for the (user, item) pair, 0 where it already does |
| not_yet_rated = V_selected == 0; |
| |
| # zero out predictions for pairs that already have an entry in V |
| V_filter = not_yet_rated * V_filter; |
| |
| |
| # ---- pick the K highest predictions per user, one rank per iteration ---- |
| |
| num_users = nrow (V_filter); |
| num_items = ncol (V_filter); |
| |
| # result holders: one row per requested user, one column per rank |
| V_top_indices = matrix(0, rows = num_users, cols = K); |
| V_top_values = matrix(0, rows = num_users, cols = K); |
| |
| # offset larger than the spread of V_filter: subtracting it from a chosen entry |
| # pushes that entry below every entry that has not been chosen yet |
| mask_offset = max (V_filter) - min (V_filter) + 1; |
| |
| # row numbers 1..num_users, reused to mark the chosen (row, column) positions |
| user_rows = seq (1, num_users, 1); |
| |
| for (k in 1:K) { |
|   # current best item and its predicted rating for every user |
|   best_item = rowIndexMax (V_filter); |
|   best_score = rowMaxs (V_filter); |
|   V_top_indices[,k] = best_item; |
|   V_top_values[,k] = best_score; |
|   # indicator of the entries just chosen, used to take them out of the next round |
|   chosen = table (user_rows, best_item, num_users, num_items); |
|   V_filter = V_filter - mask_offset * chosen; |
| } |
| |
| # item-ids whose selected rating is not positive are reported as 0 |
| V_top_indices = V_top_indices * (V_top_values > 0); |
| |
| # ---- prefix both results with the user-id column and store them ---- |
| |
| requested_users = X[,1]; |
| top_items_out = cbind (requested_users, V_top_indices); |
| top_ratings_out = cbind (requested_users, V_top_values); |
| |
| # item-ids go to Y, the matching ratings to Y + ".ratings" |
| write (top_items_out, fileY, format = fmtO); |
| write (top_ratings_out, fileY+".ratings", format = fmtO); |