blob: c2a5e3ab0e2b356f93a26bd301cf8a0fb530aac1 [file] [log] [blame]
 \begin{comment} Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. \end{comment} \subsection{Matrix Completion via Alternating Minimizations} \label{matrix_completion} \noindent{\bf Description} \smallskip Low-rank matrix completion is an effective technique for statistical data analysis widely used in data mining and machine learning applications. Matrix completion is a variant of low-rank matrix factorization with the goal of recovering a partially observed and potentially noisy matrix from a subset of its revealed entries. Perhaps the most popular application in which matrix completion has been successfully applied is in the context of collaborative filtering in recommender systems. In this setting, the rows in the data matrix correspond to users, the columns to items such as movies, and entries to feedback provided by users for items. The goal is to predict missing entries of the rating matrix. 
This implementation uses the alternating least-squares (ALS) technique for solving large-scale matrix completion problems.\\ \smallskip \noindent{\bf Usage} \smallskip {\hangindent=\parindent\noindent\it% {\tt{}-f }path/\/{\tt{}ALS.dml} {\tt{} -nvargs} {\tt{} V=}path/file {\tt{} L=}path/file {\tt{} R=}path/file % {\tt{} VO=}path/file {\tt{} rank=}int {\tt{} reg=}L2$\mid$wL2%regularization {\tt{} lambda=}double {\tt{} maxi=}int {\tt{} check=}boolean {\tt{} thr=}double {\tt{} fmt=}format } \smallskip \noindent{\bf Arguments} \begin{Description} \item[{\tt V}:] Location (on HDFS) to read the input (user-item) matrix $V$ to be factorized \item[{\tt L}:] Location (on HDFS) to write the left (user) factor matrix $L$ \item[{\tt R}:] Location (on HDFS) to write the right (item) factor matrix $R$ % \item[{\tt VO}:] % Location (on HDFS) to write the input matrix $VO$ with empty rows and columns removed (if there are any) \item[{\tt rank}:] (default:\mbox{ }{\tt 10}) Rank of the factorization \item[{\tt reg}:] (default:\mbox{ }{\tt L2}) Regularization:\\ {\tt L2} = L2 regularization;\\ {\tt wL2} = weighted L2 regularization;\\ if {\tt reg} is not provided no regularization will be performed. \item[{\tt lambda}:] (default:\mbox{ }{\tt 0.000001}) Regularization parameter \item[{\tt maxi}:] (default:\mbox{ }{\tt 50}) Maximum number of iterations \item[{\tt check}:] (default:\mbox{ }{\tt FALSE}) Check for convergence after every iteration, i.e., updating $L$ and $R$ once \item[{\tt thr}:] (default:\mbox{ }{\tt 0.0001}) Assuming {\tt check=TRUE}, the algorithm stops and convergence is declared if the decrease in loss in any two consecutive iterations falls below threshold {\tt thr}; if {\tt check=FALSE} parameter {\tt thr} is ignored. 
\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"}) Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv} \end{Description} \smallskip \noindent{\bf Usage: ALS Prediction/Top-K Prediction} \smallskip {\hangindent=\parindent\noindent\it% {\tt{}-f }path/\/{\tt{}ALS\_predict.dml} {\tt{} -nvargs} {\tt{} X=}path/file {\tt{} Y=}path/file {\tt{} L=}path/file {\tt{} R=}path/file {\tt{} Vrows=}int {\tt{} Vcols=}int {\tt{} fmt=}format }\smallskip \smallskip {\hangindent=\parindent\noindent\it% {\tt{}-f }path/\/{\tt{}ALS\_topk\_predict.dml} {\tt{} -nvargs} {\tt{} X=}path/file {\tt{} Y=}path/file {\tt{} L=}path/file {\tt{} R=}path/file {\tt{} V=}path/file {\tt{} K=}int {\tt{} fmt=}format }\smallskip % \noindent{\bf Arguments --- Prediction} % \begin{Description} % \item[{\tt X}:] % Location (on HDFS) to read the input matrix $X$ containing user-ids (first column) and item-ids (second column) % \item[{\tt L}:] % Location (on HDFS) to read the left (user) factor matrix $L$ % \item[{\tt R}:] % Location (on HDFS) to read the right (item) factor matrix $R$ % \item[{\tt Y}:] % Location (on HDFS) to write the output matrix $Y$ containing user-ids (first column), item-ids (second column) and predicted ratings (third column) % \item[{\tt Vrows}:] % Number of rows of the user-item matrix $V$ % \item[{\tt Vcols}] % Number of columns of the user-item matrix $V$ % \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"}) % Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv} % \end{Description} \noindent{\bf Arguments --- Prediction/Top-K Prediction} \begin{Description} \item[{\tt V}:] Location (on HDFS) to read the user-item matrix $V$ \item[{\tt X}:] Location (on HDFS) to read the input matrix $X$ with the following format: \begin{itemize} \item for {ALS\_predict.dml}: a 2-column matrix that contains the user-ids (first column) and the item-ids (second column), \item for {ALS\_topk\_predict.dml}: a 1-column matrix that contains the user-ids. 
 \end{itemize} \item[{\tt Y}:] Location (on HDFS) to write the output of prediction with the following format: \begin{itemize} \item for {ALS\_predict.dml}: a 3-column matrix that contains the user-ids (first column), the item-ids (second column) and the predicted ratings (third column), \item for {ALS\_topk\_predict.dml}: a ($K+1$)-column matrix that contains the user-ids in the first column and the top-K item-ids in the remaining $K$ columns will be stored at {\tt Y}. Additionally, a matrix with the same dimensions that contains the corresponding actual top-K ratings will be stored at {\tt Y.ratings}; see below for details. \end{itemize} % Note the following output format in predicting top-K items. % For a user with no available ratings in $V$ no % top-K items will be provided, i.e., the corresponding row in $Y$ will contain 0s. % Moreover, \$K'