\begin{comment}
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
\end{comment}
\subsection{Matrix Completion via Alternating Minimizations}
\label{matrix_completion}
\noindent{\bf Description}
\smallskip
Low-rank matrix completion is an effective technique for statistical data analysis that is widely used in data mining and machine learning applications.
Matrix completion is a variant of low-rank matrix factorization with the goal of recovering a partially observed and potentially noisy matrix from a subset of its revealed entries.
Perhaps the most popular application of matrix completion is collaborative filtering in recommender systems.
In this setting, the rows in the data matrix correspond to users,
the columns to items such as movies, and entries to feedback provided by users for items.
The goal is to predict missing entries of the rating matrix.
This implementation uses the alternating least-squares (ALS) technique for solving large-scale matrix completion problems.\\
\smallskip
\noindent{\bf Usage}
\smallskip
{\hangindent=\parindent\noindent\it%
{\tt{}-f }path/\/{\tt{}ALS.dml}
{\tt{} -nvargs}
{\tt{} V=}path/file
{\tt{} L=}path/file
{\tt{} R=}path/file
% {\tt{} VO=}path/file
{\tt{} rank=}int
{\tt{} reg=}L2$\mid$wL2
{\tt{} lambda=}double
{\tt{} maxi=}int
{\tt{} check=}boolean
{\tt{} thr=}double
{\tt{} fmt=}format
}
\smallskip
\noindent{\bf Arguments}
\begin{Description}
\item[{\tt V}:]
Location (on HDFS) to read the input (user-item) matrix $V$ to be factorized
\item[{\tt L}:]
Location (on HDFS) to write the left (user) factor matrix $L$
\item[{\tt R}:]
Location (on HDFS) to write the right (item) factor matrix $R$
% \item[{\tt VO}:]
% Location (on HDFS) to write the input matrix $VO$ with empty rows and columns removed (if there are any)
\item[{\tt rank}:] (default:\mbox{ }{\tt 10})
Rank of the factorization
\item[{\tt reg}:] (default:\mbox{ }{\tt L2})
Regularization:\\
{\tt L2} = L2 regularization;\\
{\tt wL2} = weighted L2 regularization;\\
if {\tt reg} is not provided, no regularization will be performed.
\item[{\tt lambda}:] (default:\mbox{ }{\tt 0.000001})
Regularization parameter
\item[{\tt maxi}:] (default:\mbox{ }{\tt 50})
Maximum number of iterations
\item[{\tt check}:] (default:\mbox{ }{\tt FALSE})
Check for convergence after every iteration, i.e., after updating $L$ and $R$ once
\item[{\tt thr}:] (default:\mbox{ }{\tt 0.0001})
Assuming {\tt check=TRUE}, the algorithm stops and convergence is declared
if the decrease in loss between two consecutive iterations falls below the threshold {\tt thr};
if {\tt check=FALSE}, parameter {\tt thr} is ignored.
\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
\end{Description}
\smallskip
\noindent{\bf Usage: ALS Prediction/Top-K Prediction}
\smallskip
{\hangindent=\parindent\noindent\it%
{\tt{}-f }path/\/{\tt{}ALS\_predict.dml}
{\tt{} -nvargs}
{\tt{} X=}path/file
{\tt{} Y=}path/file
{\tt{} L=}path/file
{\tt{} R=}path/file
{\tt{} Vrows=}int
{\tt{} Vcols=}int
{\tt{} fmt=}format
}\smallskip
\smallskip
{\hangindent=\parindent\noindent\it%
{\tt{}-f }path/\/{\tt{}ALS\_topk\_predict.dml}
{\tt{} -nvargs}
{\tt{} X=}path/file
{\tt{} Y=}path/file
{\tt{} L=}path/file
{\tt{} R=}path/file
{\tt{} V=}path/file
{\tt{} K=}int
{\tt{} fmt=}format
}\smallskip
% \noindent{\bf Arguments --- Prediction}
% \begin{Description}
% \item[{\tt X}:]
% Location (on HDFS) to read the input matrix $X$ containing user-ids (first column) and item-ids (second column)
% \item[{\tt L}:]
% Location (on HDFS) to read the left (user) factor matrix $L$
% \item[{\tt R}:]
% Location (on HDFS) to read the right (item) factor matrix $R$
% \item[{\tt Y}:]
% Location (on HDFS) to write the output matrix $Y$ containing user-ids (first column), item-ids (second column) and predicted ratings (third column)
% \item[{\tt Vrows}:]
% Number of rows of the user-item matrix $V$
% \item[{\tt Vcols}]
% Number of columns of the user-item matrix $V$
% \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
% Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
% \end{Description}
\noindent{\bf Arguments --- Prediction/Top-K Prediction}
\begin{Description}
\item[{\tt V}:]
Location (on HDFS) to read the user-item matrix $V$
\item[{\tt X}:]
Location (on HDFS) to read the input matrix $X$ with the following format:
\begin{itemize}
\item for {\tt ALS\_predict.dml}: a 2-column matrix that contains the user-ids (first column) and the item-ids (second column),
\item for {\tt ALS\_topk\_predict.dml}: a 1-column matrix that contains the user-ids.
\end{itemize}
\item[{\tt Y}:]
Location (on HDFS) to write the output of prediction with the following format:
\begin{itemize}
\item for {\tt ALS\_predict.dml}: a 3-column matrix containing the user-ids (first column), the item-ids (second column), and the predicted ratings (third column),
\item for {\tt ALS\_topk\_predict.dml}: a ($K+1$)-column matrix containing the user-ids in the first column and the top-K item-ids in the remaining $K$ columns.
Additionally, a matrix of the same dimensions containing the corresponding actual top-K ratings will be stored at {\tt Y.ratings}; see below for details.
\end{itemize}
% Note the following output format in predicting top-K items.
% For a user with no available ratings in $V$ no
% top-K items will be provided, i.e., the corresponding row in $Y$ will contains 0s.
% Moreover, $K'<K$ items with highest predicted ratings will be provided for a user $i$
% if the number of missing ratings $K'$ (i.e., those with 0 value in $V$) for $i$ is less than $K$.
\item[{\tt L}:]
Location (on HDFS) to read the left (user) factor matrix $L$
\item[{\tt R}:]
Location (on HDFS) to read the right (item) factor matrix $R$
\item[{\tt Vrows}:]
Number of rows of $V$ (i.e., number of users)
\item[{\tt Vcols}:]
Number of columns of $V$ (i.e., number of items)
\item[{\tt K}:] (default:\mbox{ }{\tt 5})
Number of top-K items for top-K prediction
\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
\end{Description}
\noindent{\bf Details}
\smallskip
Given an $m \times n$ input matrix $V$ and a rank parameter $r \ll \min{(m,n)}$, low-rank matrix factorization seeks to find an $m \times r$ matrix $L$ and an $r \times n$ matrix $R$ such that $V \approx LR$, i.e., we aim to approximate $V$ by the low-rank matrix $LR$.
The quality of the approximation is determined by an application-dependent loss function $\mathcal{L}$. We aim to find the loss-minimizing factor matrices, i.e.,
\begin{equation}\label{eq:problem}
(L^*, R^*) = \textrm{argmin}_{L,R}{\mathcal{L}(V,L,R)}.
\end{equation}
In the context of collaborative filtering in recommender systems, the input matrix $V$ typically contains many missing entries. Such entries are encoded as 0, and the loss function is computed only over the nonzero entries in $V$, i.e.,
\begin{equation*} %\label{eq:loss}
\mathcal{L}=\sum_{(i,j)\in\Omega}l(V_{ij},L_{i*},R_{*j}),
\end{equation*}
where $L_{i*}$ and $R_{*j}$, respectively, denote the $i$th row of $L$ and the $j$th column of $R$, $\Omega=\{\omega_1,\dots,\omega_N\}$ denotes the training set containing the observed (nonzero) entries in $V$, and $l$ is some local loss function.
%for some training set $\Omega$ that contains the observed (nonzero) entries in $V$ and some local loss function $l$. In the above formula,
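For concreteness, the following numpy sketch (illustrative only, not part of the provided DML scripts; the function name {\tt nzsl} and all variable names are our own) evaluates this loss over the observed entries, including the optional regularization terms introduced below:
\begin{verbatim}
import numpy as np

def nzsl(V, L, R, lam=0.0, weighted=False):
    # Squared loss over the nonzero (observed) entries of V,
    # optionally adding an (un)weighted L2 regularization term.
    mask = V != 0                         # Omega: observed entries
    E = (V - L @ R) * mask                # residuals restricted to Omega
    loss = np.sum(E ** 2)                 # L_Nzsl
    if lam > 0 and weighted:              # L_Nzsl+wL2 (see table below)
        Nrow = mask.sum(axis=1)           # N_{i*}: nonzeros per row
        Ncol = mask.sum(axis=0)           # N_{*j}: nonzeros per column
        loss += lam * (Nrow @ np.sum(L**2, axis=1)
                       + np.sum(R**2, axis=0) @ Ncol)
    elif lam > 0:                         # L_Nzsl+L2
        loss += lam * (np.sum(L**2) + np.sum(R**2))
    return loss
\end{verbatim}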
ALS is an optimization technique that can be used to solve quadratic problems.
For matrix completion, the algorithm repeatedly keeps one of the unknown matrices ($L$ or $R$) fixed and optimizes the other one. In particular, ALS alternates between recomputing the rows of $L$ in one step and the columns of $R$ in the subsequent step.
Our implementation of the ALS algorithm supports the loss functions summarized in Table~\ref{tab:loss_functions} commonly used for matrix completion~\cite{ZhouWSP08:als}.
%
\begin{table}[t]
\centering
\begin{tabular}{|ll|} \hline
Loss & Definition \\ \hline
% $\mathcal{L}_\text{Sl}$ & $\sum_{i,j} (V_{ij} - [LR]_{ij})^2$ \\
% $\mathcal{L}_\text{Sl+L2}$ & $\mathcal{L}_\text{Sl} + \lambda \Bigl( \sum_{ik} L_{ik}^2 + \sum_{kj} R_{kj}^2 \Bigr)$ \\
$\mathcal{L}_\text{Nzsl}$ & $\sum_{i,j:V_{ij}\neq 0} (V_{ij} - [LR]_{ij})^2$ \\
$\mathcal{L}_\text{Nzsl+L2}$ & $\mathcal{L}_\text{Nzsl} + \lambda \Bigl( \sum_{ik} L_{ik}^2 + \sum_{kj} R_{kj}^2 \Bigr)$ \\
$\mathcal{L}_\text{Nzsl+wL2}$ & $\mathcal{L}_\text{Nzsl} + \lambda \Bigl(\sum_{ik}N_{i*} L_{ik}^2 + \sum_{kj}N_{*j} R_{kj}^2 \Bigr)$ \\ \hline
\end{tabular}
\caption{Popular loss functions supported by our ALS implementation; $N_{i*}$ and $N_{*j}$, respectively, denote the number of nonzero entries in row $i$ and column $j$ of $V$.}
\label{tab:loss_functions}
\end{table}
Note that the matrix completion problem as defined in (\ref{eq:problem}) is a non-convex problem for all loss functions from Table~\ref{tab:loss_functions}.
However, when fixing one of the matrices $L$ or $R$, we get a least-squares problem with a globally optimal solution.
For example, for the case of $\mathcal{L}_\text{Nzsl+wL2}$ we have the following closed-form solutions:
\begin{align*}
L^\top_{n+1,i*} &\leftarrow \bigl(R^{(i)}_n {[R^{(i)}_n]}^\top + \lambda N_{i*} I\bigr)^{-1} R^{(i)}_n V^\top_{i*}, \\
R_{n+1,*j} &\leftarrow \bigl({[L^{(j)}_{n+1}]}^\top L^{(j)}_{n+1} + \lambda N_{*j} I\bigr)^{-1} {[L^{(j)}_{n+1}]}^\top V_{*j},
\end{align*}
where $L_{n+1,i*}$ (resp. $R_{n+1,*j}$) denotes the $i$th row of $L_{n+1}$ (resp. the $j$th column of $R_{n+1}$), $\lambda$ denotes
the regularization parameter, $I$ is the identity matrix of appropriate dimensionality,
$V_{i*}$ (resp. $V_{*j}$) denotes the revealed entries in row $i$ (column $j$),
$R^{(i)}_n$ (resp. $L^{(j)}_{n+1}$) refers to the corresponding columns of $R_n$ (rows of $L_{n+1}$),
and $N_{i*}$ (resp. $N_{*j}$) denotes the number of nonzero entries in row $i$ (column $j$) of $V$, as in Table~\ref{tab:loss_functions}.
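The sketch below (a simplified, single-node numpy illustration under our own naming, not the distributed implementation in {\tt ALS.dml}) applies these closed-form updates alternately and includes the convergence check controlled by {\tt check} and {\tt thr}:
\begin{verbatim}
import numpy as np

def als_wl2(V, rank=10, lam=1e-6, maxi=50, thr=1e-4):
    # ALS for L_Nzsl+wL2: alternately solve the closed-form
    # least-squares problems for the rows of L and columns of R.
    m, n = V.shape
    rng = np.random.default_rng(0)
    L = 0.01 * rng.standard_normal((m, rank))
    R = 0.01 * rng.standard_normal((rank, n))
    mask = V != 0
    prev = np.inf
    for _ in range(maxi):
        for i in range(m):                # update row i of L
            idx = mask[i]
            if not idx.any():             # user with no ratings: all 0s
                L[i] = 0.0
                continue
            Ri = R[:, idx]                # R^(i): columns with revealed entries
            A = Ri @ Ri.T + lam * idx.sum() * np.eye(rank)
            L[i] = np.linalg.solve(A, Ri @ V[i, idx])
        for j in range(n):                # update column j of R
            idx = mask[:, j]
            if not idx.any():             # item with no ratings: all 0s
                R[:, j] = 0.0
                continue
            Lj = L[idx]                   # L^(j): rows with revealed entries
            A = Lj.T @ Lj + lam * idx.sum() * np.eye(rank)
            R[:, j] = np.linalg.solve(A, Lj.T @ V[idx, j])
        loss = np.sum(((V - L @ R) * mask) ** 2)
        if prev - loss < thr:             # convergence check (check=TRUE)
            break
        prev = loss
    return L, R
\end{verbatim}
Note how users (or items) without any revealed ratings receive all-0 rows in $L$ (columns in $R$), matching the behavior described under {\bf Returns} below.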
% For example, for the case of $\mathcal{L}_\text{Sl-L2}$ we have the following closed form solutions
% \begin{align*}
% L^\top_{n+1,i*} &\leftarrow (R_n {[R_n]}^\top + \lambda I)^{-1} R_n V^\top_{i*}, \\
% R_{n+1,*j} &\leftarrow ({[L_{n+1}]}^\top L_{n+1} + \lambda I)^{-1} L^\top_{n+1} V_{*j},
% \end{align*}
% where $L_{n+1,i*}$ (resp. $R_{n+1,*j}$) denotes the $i$th row of $L_{n+1}$ (resp. $j$th column of $R_{n+1}$), $\lambda$ denotes
% the regularization parameter and $I$ is the identity matrix of appropriate dimensionality.
% For the case of $\mathcal{L}_\text{Nzsl}$ we need to remove the equation that correspond to zero entries of $V$ from the least-squares problems.
% With wL2 we get the following equations
% \begin{align*}
% L^\top_{n+1,i*} &\leftarrow (R^{(i)}_n {[R^{(i)}_n]}^\top + \lambda N_2 I)^{-1} R_n V^\top_{i*}, \\
% R_{n+1,*j} &\leftarrow ({[L^{(j)}_{n+1}]}^\top L^{(j)}_{n+1} + \lambda N_1 I)^{-1} L^\top_{n+1} V_{*j},
% \end{align*}
% where $V_{i*}$ (resp. $V_{*j}$) denotes the revealed entries in row $i$ (column $j$),
% $R^{(i)}_n$ (resp. $L^{(j)}_{n+1}$) refers to the corresponding columns of $R_n$ (rows of $L_{n+1}$),
% and $N_1$ (resp. $N_2$) denotes a diagonal matrix that contains the number of nonzero entries in row $i$ (column $j$) of $V$.
\textbf{Prediction.}
Based on the factor matrices computed by ALS we provide two prediction scripts:
\begin{Enumerate}
\item {\tt ALS\_predict.dml} computes the predicted ratings for a given list of users and items;
\item {\tt ALS\_topk\_predict.dml} computes the top-$K$ items (where $K$ is given as input) with the highest predicted ratings, together with their corresponding ratings, for a given list of users.
\end{Enumerate}
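The numpy sketch below illustrates, under assumed names and 0-based indexing, what the two scripts compute from the factor matrices; for simplicity it assumes every listed user has at least $K$ unrated items and omits the companion {\tt Y.ratings} output:
\begin{verbatim}
import numpy as np

def predict_ratings(X, L, R):
    # ALS_predict-style: predicted rating for each (user, item) pair;
    # X is an integer 2-column array of user-ids and item-ids.
    users, items = X[:, 0], X[:, 1]
    preds = np.sum(L[users] * R[:, items].T, axis=1)  # [LR]_ij per pair
    return np.column_stack([users, items, preds])

def topk_items(users, L, R, V, K=5):
    # ALS_topk_predict-style: top-K items with highest predicted
    # ratings among the entries missing in V, per user.
    out = np.zeros((len(users), K + 1))
    out[:, 0] = users
    scores = L[users] @ R                     # predicted ratings
    for r, u in enumerate(users):
        if not (V[u] != 0).any():             # no ratings: columns stay 0s
            continue
        s = np.where(V[u] == 0, scores[r], -np.inf)  # missing entries only
        out[r, 1:] = np.argsort(-s)[:K]       # top-K item-ids
    return out
\end{verbatim}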
\smallskip
\noindent{\bf Returns}
\smallskip
We output the factor matrices $L$ and $R$ after the algorithm has converged. The algorithm is declared converged if one of the following two criteria is met:
(1) the decrease in the value of the loss function falls below the threshold {\tt thr}
given as an input parameter (if parameter {\tt check=TRUE}), or (2) the maximum number of iterations (defined by parameter {\tt maxi}) is reached.
Note that for a given user $i$ prediction is possible only if user $i$ has rated at least one item, i.e., row $i$ in matrix $V$ has at least one nonzero entry.
In case some users have not rated any items, the corresponding rows in $L$ will contain only 0s.
Similarly, if some items have not been rated at all, the corresponding columns in $R$ will contain only 0s.
Our prediction scripts output the predicted ratings for a given list of users and items, as well as the top-K items with the highest predicted ratings together with the predicted ratings for a given list of users. Note that predictions will only be provided for users who have rated at least one item, i.e., whose corresponding rows in $V$ contain at least one nonzero entry.
% Moreover in the case of top-K prediction, if the number of predicted ratings---i.e., missing entries--- for some user $i$ is less than the input parameter $K$, all the predicted ratings for user $i$ will be provided.
\smallskip
\noindent{\bf Examples}
\smallskip
% {\hangindent=\parindent\noindent\tt
% \hml -f ALS.dml -nvargs V=/user/biadmin/V L=/user/biadmin/L R=/user/biadmin/R rank=10 reg="L2" lambda=0.0001 fmt=csv
%
% }
\noindent To compute the factor matrices $L$ and $R$:
{\hangindent=\parindent\noindent\tt
\hml -f ALS.dml -nvargs V=/user/biadmin/V L=/user/biadmin/L R=/user/biadmin/R rank=10 reg="wL2" lambda=0.0001 maxi=50 check=TRUE thr=0.001 fmt=csv
}
\noindent To compute predicted ratings for a given list of users and items:
{\hangindent=\parindent\noindent\tt
\hml -f ALS\_predict.dml -nvargs X=/user/biadmin/X Y=/user/biadmin/Y L=/user/biadmin/L R=/user/biadmin/R Vrows=100000 Vcols=10000 fmt=csv
}
\noindent To compute top-K items with highest predicted ratings together with the predicted ratings for a given list of users:
{\hangindent=\parindent\noindent\tt
\hml -f ALS\_topk\_predict.dml -nvargs X=/user/biadmin/X Y=/user/biadmin/Y L=/user/biadmin/L R=/user/biadmin/R V=/user/biadmin/V K=10 fmt=csv
}
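\noindent After a run with {\tt fmt=csv}, the factor matrices can be inspected locally, e.g., with numpy (a hedged sketch: it assumes the outputs were copied out of HDFS into single local csv files named {\tt L.csv} and {\tt R.csv}):
\begin{verbatim}
import numpy as np

# Load the factor matrices written by ALS.dml with fmt=csv
# (assumes single local csv files copied from HDFS).
L = np.loadtxt("L.csv", delimiter=",")
R = np.loadtxt("R.csv", delimiter=",")

V_hat = L @ R        # low-rank approximation of V
print(V_hat[0, 0])   # predicted rating of the first user for the first item
\end{verbatim}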
%
%\begin{itemize}
% \item Y. Zhou, D. K. Wilkinson, R. Schreiber, and R. Pan. \newblock{Large-scale parallel collaborative filtering for the Netflix prize}. In Proceedings of the International
% Conference on Algorithmic Aspects in Information and Management (AAIM), 2008, 337--348.
%\end{itemize}