% When using TeXShop on the Mac, let it know the root document.
% The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter{Latent Dirichlet Allocation (LDA)}
\begin{moduleinfo}
\item[Author] \href{mailto:shengwen.yang@emc.com}{Shengwen Yang} (version 0.6 only)
\item[History]
\begin{modulehistory}
\item[v0.6] Initial revision of design document, complete rewrite of module, standard parallel implementation (support for local model update tuple by tuple and global model update iteration by iteration)
\item[v0.1] Initial version (an approximate implementation, which has memory problems with big datasets)
\end{modulehistory}
\end{moduleinfo}
LDA is a very popular technique for topic modeling. This module implements a parallel Gibbs sampling algorithm for LDA inference.
\section{Overview of LDA}
LDA\cite{Blei:2003} is a very popular technique for discovering the main themes or
topics from a large collection of unstructured documents and has been widely
applied to various fields, including text mining, computer vision, finance,
bioinformatics, cognitive science, music, and social sciences.
With LDA, a document can be represented as a random mixture of latent topics,
where each topic can be characterized by a probability distribution over a
vocabulary of words. Given a large text corpus, LDA can infer a set of latent
topics from the corpus, each represented as a multinomial distribution over
words, denoted as $P(w|z)$, and infer the topic distribution for each document,
represented as a multinomial distribution over topics, denoted as $P(z|d)$.
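For reference, the generative process assumed by LDA can be summarized as
follows (this is the standard formulation from \cite{Blei:2003}; the symbols
$\theta_d$ and $\phi_k$ are introduced here only for exposition):
\begin{align*}
\phi_k &\sim \text{Dirichlet}(\beta), & k &= 1, \ldots, T\\
\theta_d &\sim \text{Dirichlet}(\alpha), & d &= 1, \ldots, D\\
z_{d,i} &\sim \text{Multinomial}(\theta_d), &
w_{d,i} &\sim \text{Multinomial}(\phi_{z_{d,i}})
\end{align*}
where $T$ is the number of topics, $D$ is the number of documents, $\alpha$ and
$\beta$ are the Dirichlet priors, $\phi_k$ corresponds to $P(w|z=k)$, and
$\theta_d$ corresponds to $P(z|d)$.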
Several methods have been proposed for the inference of LDA, including
variational Bayesian, expectation propagation, and Gibbs
sampling\cite{griffiths04finding}. Among these methods, Gibbs sampling is
the most widely used one because it is simple, fast, and has very few
adjustable parameters. In addition, Gibbs sampling is easy to parallelize and
to scale up, which allows us to utilize a cluster of machines to handle very
large datasets.
\section{Gibbs Sampling for LDA}
\subsection{Overview}
Although the derivation of Gibbs sampling for LDA is complicated, the result is quite simple. The following equation tells us how to sample a new topic for a word in the corpus:
\begin{equation}
P(z_i=k|\textbf{z}_{-i},\textbf{w}) \propto \frac{nwz_{-i,k}^{(w_i)} +\beta}{nz_{-i,k}+W \beta}\times(ndz_{-i,k}^{(d_i)}+\alpha)
\end{equation}
where:
\begin{itemize}
\item $i$ - index of a word in the corpus
\item $d_i$ - docid of the $i^{th}$ word
\item $w_i$ - wordid of the $i^{th}$ word
\item $k$ - the $k^{th}$ topic, where $1 \leq k \leq T$, and $T$ is the number of topics
\item $z_i$ - topic assignment of the $i^{th}$ word
\item $\textbf{z}_{-i}$ - topic assignments of all words excluding the $i^{th}$ word
\item $\textbf{w}$ - all words in the corpus
\item $W$ - size of the vocabulary
\item $\alpha$ - Dirichlet prior on the per-document topic distribution
\item $\beta$ - Dirichlet prior on the per-word topic distribution
\item $ndz$ - per-document topic counts
\item $nwz$ - per-word topic counts
\item $nz$ - corpus-level topic counts
\end{itemize}
According to this equation, we can update the topic assignment of each word sequentially. This process is iterated until the conditional distribution reaches a stable state.
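As a small illustrative example with made-up counts, suppose $T = 2$, $W = 1000$, $\alpha = 0.1$, $\beta = 0.01$, and that for the current word the counts excluding word $i$ are $nwz_{-i,1}^{(w_i)} = 3$, $nwz_{-i,2}^{(w_i)} = 0$, $nz_{-i,1} = 500$, $nz_{-i,2} = 450$, $ndz_{-i,1}^{(d_i)} = 2$, and $ndz_{-i,2}^{(d_i)} = 8$. Then
\begin{align*}
P(z_i=1|\textbf{z}_{-i},\textbf{w}) &\propto \frac{3 + 0.01}{500 + 1000 \times 0.01}\times(2 + 0.1) \approx 0.0124,\\
P(z_i=2|\textbf{z}_{-i},\textbf{w}) &\propto \frac{0 + 0.01}{450 + 1000 \times 0.01}\times(8 + 0.1) \approx 0.00018,
\end{align*}
so after normalization the new topic for this word is drawn with probability $\approx 0.986$ for topic 1 and $\approx 0.014$ for topic 2.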
\subsection{Parallelization}
The parallelization of the above algorithm is straightforward. The basic idea is to distribute a large set of documents to a cluster of segment nodes and let each segment node do Gibbs sampling on a subset of documents locally. Note that at the end of each iteration, the local models generated on the segment nodes are merged into a global model, which is redistributed to the segment nodes at the beginning of the next iteration.
Refer to \cite{Wang:2009} for a similar parallel implementation based on MPI and MapReduce.
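Conceptually, merging the local models at the end of an iteration amounts to summing the per-segment topic counts. The following is only a sketch of this idea, assuming a hypothetical table \texttt{local\_nwz(distid, wordid, topicid, cnt)} that holds each segment's local word-topic counts; the actual implementation stores the counts as arrays and merges them with the \texttt{\_\_newplda\_count\_topic\_agg} aggregate shown later.
\begin{sql}
-- Sketch only: merge per-segment word-topic counts into a global model
-- by summing the counts over all segments (local_nwz is hypothetical).
SELECT wordid, topicid, sum(cnt) AS global_cnt
FROM local_nwz
GROUP BY wordid, topicid;
\end{sql}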
\subsection{Formal Description}
\begin{algorithm}[gibbs-lda$(D, T, \alpha, \beta)$] \label{alg:gibbs-lda}
\alginput{
\\Dataset $D$,
\\topic number $T$,
\\prior on per-document topic distribution $\alpha$,
\\prior on per-word topic distribution $\beta$
}
\algoutput{
\\Per-document topic distribution $P(z|d)$,
\\per-word topic distribution $P(z|w)$
}
\begin{algorithmic}[1]
\State $ndz \set 0$
\State $nwz \set 0$
\State $nz \set 0$
\For{$d \in D$}
\For{$w \in W_d$}
\State $z \set \text{random}(T)$
\State $ndz[d,z] \set ndz[d,z] + 1$
\State $nwz[w,z] \set nwz[w,z] + 1$
\State $nz[z] \set nz[z] + 1$
\State $Z[d,w] \set z$
\EndFor
\EndFor
\Repeat
\For{$d \in D$}
\For{$w \in W_d$}
\State $z_{old} \set Z[d,w]$
\State $z_{new} \set \text{gibbs-sample}(z_{old}, ndz[d], nwz[w], nz, \alpha, \beta)$
\State $Z[d,w] \set z_{new}$
\State $ndz[d,z_{old}] \set ndz[d,z_{old}] - 1$
\State $nwz[w,z_{old}] \set nwz[w,z_{old}] - 1$
\State $nz[z_{old}] \set nz[z_{old}] - 1$
\State $ndz[d,z_{new}] \set ndz[d,z_{new}] + 1$
\State $nwz[w,z_{new}] \set nwz[w,z_{new}] + 1$
\State $nz[z_{new}] \set nz[z_{new}] + 1$
\EndFor
\EndFor
\Until {Stop condition is satisfied}
\State $P(z|d) \set \text{normalize}(ndz, \alpha)$
\State $P(z|w) \set \text{normalize}(nwz, \beta)$
\end{algorithmic}
\end{algorithm}
\begin{description}
\item[Parallelism] The inner loop is sequential.
The outer loop is data-parallel and model averaging is used.
\end{description}
\subsection{Implementation as User-Defined Function}
Algorithm~\ref{alg:gibbs-lda} (\texttt{gibbs-lda}) is implemented as the user-defined function \texttt{lda\_train}.
\begin{center}
\begin{tabularx}{\textwidth}{rlXl}
\toprule%
\textbf{In/Out} & \textbf{Name} & \textbf{Description} & \textbf{Type}
\\\otoprule
In &
\texttt{data\_table} &
Table containing the training dataset &
Relation
\\\midrule
In &
\texttt{voc\_size} &
Size of vocabulary &
Integer
\\\midrule
In &
\texttt{topic\_num} &
Number of topics &
Integer
\\\midrule
In &
\texttt{iter\_num} &
Number of iterations &
Integer
\\\midrule
In &
\texttt{alpha} &
Prior on per-document topic distribution &
Double
\\\midrule
In &
\texttt{beta} &
Prior on per-word topic distribution &
Double
\\\midrule
Out &
\texttt{model\_table} &
Table containing the model information &
Relation
\\\midrule
Out &
\texttt{output\_data\_table} &
Table containing the per-document topic counts and topic assignments &
Relation
\\\bottomrule
\end{tabularx}
\end{center}
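For illustration, a training run might be launched as follows. The table names \texttt{my\_corpus}, \texttt{my\_model}, and \texttt{my\_outdata} and the parameter values are hypothetical, and the exact argument order is defined by the declared signature of \texttt{lda\_train} rather than by this sketch.
\begin{sql}
-- Hypothetical invocation of the LDA training function.
SELECT madlib.lda_train(
    'my_corpus',     -- data_table: training dataset
    'my_model',      -- model_table: model information
    'my_outdata',    -- output_data_table: per-document topic counts and assignments
    10000,           -- voc_size: size of vocabulary
    20,              -- topic_num: number of topics
    30,              -- iter_num: number of iterations
    5.0,             -- alpha: prior on per-document topic distribution
    0.01             -- beta: prior on per-word topic distribution
);
\end{sql}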
Internally, two work tables are used alternately in the iterative Gibbs sampling process, one as input and the other as output. The key part of an iteration is essentially implemented by the following SQL:
\begin{sql}[emph={work_table_out,work_table_in,__newplda_gibbs_sample,__newplda_count_topic_agg,model}]
INSERT INTO work_table_out
SELECT
distid,
docid,
wordcount,
words,
counts,
madlib.__newplda_gibbs_sample(
words,
counts,
doc_topic,
model,
alpha,
beta,
voc_size,
topic_num)
FROM
(
SELECT
distid,
docid,
wordcount,
words,
counts,
doc_topic,
model
FROM
(
SELECT
madlib.__newplda_count_topic_agg(
words,
counts,
doc_topic[topic_num + 1:array_upper(doc_topic, 1)],
    -- the slice above is the topic assignment part of doc_topic
voc_size,
topic_num) model
FROM
work_table_in
) t1
CROSS JOIN
work_table_in
) t2
\end{sql}
Note that within the \texttt{madlib.\_\_newplda\_gibbs\_sample} function, the \texttt{model} parameter is read in the first invocation and kept in memory. In subsequent invocations within the same query, this parameter is ignored. In this way, the model can be updated by one invocation and the updated model is carried over to the next invocation.
The above SQL can be further rewritten to eliminate data redundancy and reduce the overhead of the join operation, and thus improve the overall performance. This is very useful when the product $voc\_size \times topic\_num$ is very large. See below for the rewritten SQL:
\begin{sql}[emph={work_table_out,work_table_in,__newplda_gibbs_sample,__newplda_count_topic_agg,model}]
INSERT INTO work_table_out
SELECT
distid,
docid,
wordcount,
words,
counts,
madlib.__newplda_gibbs_sample(
words,
counts,
doc_topic,
model,
alpha,
beta,
voc_size,
topic_num)
FROM
(
SELECT
dcz.distid,
dcz.docid,
dcz.wordcount,
dcz.words,
dcz.counts,
dcz.doc_topic,
chunk.model
FROM
(
SELECT
distid, docid, model
FROM
(
SELECT
madlib.__newplda_count_topic_agg(
words,
counts,
doc_topic[topic_num + 1:array_upper(doc_topic, 1)],
    -- the slice above is the topic assignment part of doc_topic
voc_size,
topic_num) model
FROM
work_table_in
) t1,
(
SELECT
distid,
min(docid) docid
FROM
work_table_in
GROUP BY distid
) t2 -- One row per-segment
) chunk -- Ideally only one row per-segment
RIGHT JOIN work_table_in dcz
ON (dcz.distid = chunk.distid AND dcz.docid = chunk.docid)
ORDER BY distid, docid -- Local data manipulation, no data redistribution
) joined -- Ideally only one row per-segment has the fully joined data
\end{sql}