/* ----------------------------------------------------------------------- *//**
*
* @file plda.sql_in
*
* @brief SQL functions for parallel Latent Dirichlet Allocation
* @date April 2011
*
* @sa For an introduction to Latent Dirichlet Allocation models, see the
module description \ref grp_plda.
*
*//* ------------------------------------------------------------------------*/
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_plda
@about
Latent Dirichlet Allocation (LDA) is a generative probabilistic
model for natural-language text that has received considerable attention in recent years.
The model is quite versatile, having found uses in problems such as automated
topic discovery, collaborative filtering, and document classification.
The LDA model posits that each document is associated with a mixture of various
topics (e.g. a document is related to Topic 1 with probability 0.7, and Topic 2 with
probability 0.3), and that each word in the document is attributable to one
of the document's topics. There is a (symmetric) Dirichlet prior with parameter
\f$ \alpha \f$ on each document's topic mixture. In addition, there is another
(symmetric) Dirichlet prior with parameter \f$ \eta \f$ on the distribution
of words for each topic. The following generative process then defines a distribution
over a corpus of documents. First sample, for each topic \f$ i \f$, a per-topic word distribution
\f$ \Phi_i \f$ from the Dirichlet(\f$\eta\f$) prior.
Then for each document:
-# Sample a document length N from a suitable distribution, say, Poisson.
-# Sample a topic mixture \f$ \theta \f$ for the document from the Dirichlet(\f$\alpha\f$) distribution.
-# For each of the N words:
-# Sample a topic \f$ z_n \f$ from the multinomial topic distribution \f$ \theta \f$.
-# Sample a word \f$ w_n \f$ from the multinomial word distribution \f$ \Phi_{z_n} \f$ associated with topic \f$ z_n \f$.
In practice, only the words in each document are observable. The topic mixture of
each document and the topic assigned to each word are latent variables that must be
inferred from the observed words; this is what is usually meant by the inference
problem for LDA. Exact inference
is intractable, but several approximate inference algorithms for LDA have been
developed. The simple and effective Gibbs sampling algorithm described in
Griffiths and Steyvers [2] appears to be the current algorithm of choice. Our
parallel implementation of LDA comes from Wang et al [3], which is essentially
a straightforward parallelisation of the Gibbs sampling algorithm.
See also http://code.google.com/p/plda/.
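For reference, the collapsed Gibbs sampler of [2] (sketched here for orientation, in
notation slightly adapted from that paper) repeatedly resamples the topic assignment
\f$ z_i \f$ of each word occurrence \f$ w_i \f$ in document \f$ d_i \f$ from
\f[
P(z_i = j \mid z_{-i}, w) \;\propto\;
\frac{n^{(w_i)}_{-i,j} + \eta}{n^{(\cdot)}_{-i,j} + W\eta} \cdot
\frac{n^{(d_i)}_{-i,j} + \alpha}{n^{(d_i)}_{-i,\cdot} + T\alpha},
\f]
where \f$ n^{(w_i)}_{-i,j} \f$ counts how often word \f$ w_i \f$ is currently assigned to
topic \f$ j \f$ (excluding position \f$ i \f$), \f$ n^{(d_i)}_{-i,j} \f$ counts the words of
document \f$ d_i \f$ assigned to topic \f$ j \f$, \f$ W \f$ is the dictionary size, and
\f$ T \f$ is the number of topics.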
@input
The \b corpus to be analyzed is expected to be of the following form:
<pre>{TABLE|VIEW} <em>datatable</em> (
<em>id</em> INTEGER,
<em>contents</em> INTEGER[],
...
)</pre>
where \c id refers to the document ID, and \c contents is an integer array specifying the words in the document by their indices in
the dictionary. Words must be represented by positive integers.
The \b dictionary that indexes all the words found in the corpus is of the following form:
<pre>{TABLE|VIEW} <em>dicttable</em> (
<em>dict</em> TEXT[],
...
)</pre>
@usage
- Topic inference is achieved through the following UDF
<pre>
SELECT \ref plda_run('<em>datatable</em>', '<em>dicttable</em>', '<em>modeltable</em>', '<em>outputdatatable</em>',
<em>numiter</em>, <em>numtopics</em>, <em>alpha</em>, <em>eta</em>);
</pre>
This function stores the learned model in <tt><em>modeltable</em></tt> and a copy of the corpus, annotated with topic assignments, in <tt><em>outputdatatable</em></tt>.
- Labelling of test documents using a learned LDA model is achieved using the following UDF
<pre>
SELECT \ref plda_label_test_documents('<em>testtable</em>', '<em>outputtable</em>', '<em>modeltable</em>', '<em>dicttable</em>',
<em>numtopics</em>, <em>alpha</em>, <em>eta</em>);
</pre>
This creates the following table with the assigned topic for each word in the test corpus.
<pre>
id | contents | topics
----+-------------------------------+-----------------------------------
...
</pre>
@implementation
The input format for the Parallel LDA module is different from that used by the
`lda' package for R. In the `lda' package, each document is represented by two
integer arrays of equal length: the first lists the distinct words that occur
in the document, and the second gives the number of times each of those words
occurs. For example, a document containing word 5 twice and word 8 once would be
represented as <tt>{5,8}</tt> and <tt>{2,1}</tt> (this module instead expects the flat list <tt>{5,5,8}</tt>).
This representation appears to have a major weakness in that all the occurrences
of a word in a document must be assigned the same topic, which is clearly not
satisfactory. Further, at the time of writing, the main learning function in the
`lda' package actually does not work correctly when the occurrence counts for words
aren't one.
There is a script called generateTestCases.cc that can be used to generate some
simple test documents to validate the correctness and efficiency of the parallel
LDA implementation.
@examp
We now give a usage example.
-# As a first step, we need to prepare a corpus and dictionary in the appropriate structure.
\code
sql> CREATE TABLE MADLIB_SCHEMA.plda_mydict ( dict text[] ) DISTRIBUTED RANDOMLY;
sql> INSERT into MADLIB_SCHEMA.plda_mydict values
('{human,machine,interface,for,abc,computer,applications,a,survey,of,
user,opinion,system,response,time,the,eps,management,and,engineering,
testing,relation,perceived,to,error,generation,random,binary,order,tree,
intersection,graph,path,in,minor,IV,widths,well,quasi}');
sql> CREATE TABLE MADLIB_SCHEMA.plda_mycorpus ( id int4, contents int4[] );
sql> INSERT INTO MADLIB_SCHEMA.plda_mycorpus VALUES
(1, '{1,2,3,4,5,6,7}'),
(2, '{8,9,10,11,12,10,6,13,14,15}'),
(3, '{16,17,11,3,18,13}'),
(4, '{13,19,1,13,20,21,10,17}'),
(5, '{22,10,11,23,14,15,24,25,18}'),
(6, '{16,26,10,27,28,29,30}'),
(7, '{16,31,32,10,33,34,30}'),
(8, '{32,35,36,37,10,30,19,38,39,29}'),
(9, '{32,35,8,9}') ;
\endcode
-# To perform inference, we call the plda_run() function with the appropriate parameters.
Here is an example.
\code
sql> select MADLIB_SCHEMA.plda_run('MADLIB_SCHEMA.plda_mycorpus', 'MADLIB_SCHEMA.plda_mydict',
'MADLIB_SCHEMA.plda_mymodel', 'MADLIB_SCHEMA.plda_corpus',
30,10,0.5,0.5);
\endcode
After a successful run of the plda_run() function, the most probable words associated
with each topic are displayed. Other results of the learning process can be obtained
by running the following commands. Here we assume the modeltable and outputdatatable
parameters to plda_run() are 'MADLIB_SCHEMA.plda_mymodel' and 'MADLIB_SCHEMA.plda_corpus' respectively.
-# The topic assignments for each document can be obtained as follows:
\code
sql> select id, (topics).topics from MADLIB_SCHEMA.plda_corpus;
\endcode
-# The topic distribution of each document can be obtained as follows:
\code
sql> select id, (topics).topic_d from MADLIB_SCHEMA.plda_corpus;
\endcode
-# The number of times each word was assigned to a given topic in the whole corpus can
be computed as follows:
\code
sql> select ss.i, MADLIB_SCHEMA.plda_word_topic_distrn(gcounts,$numtopics,ss.i)
from MADLIB_SCHEMA.plda_mymodel,
(select generate_series(1,$dictsize) i) as ss;
\endcode
where $numtopics is the number of topics used in the learning process, and
$dictsize is the size of the dictionary.
-# The total number of words assigned to each topic in the whole corpus can be computed
as follows:
\code
sql> select MADLIB_SCHEMA.plda_sum_int4array_agg((topics).topic_d) topic_sums from MADLIB_SCHEMA.plda_corpus;
\endcode
-# To use a learned LDA model to label new documents, we can use the following commands:
\code
sql> select MADLIB_SCHEMA.plda_label_test_documents('MADLIB_SCHEMA.plda_mycorpus', 'MADLIB_SCHEMA.plda_testresult', 'MADLIB_SCHEMA.plda_mymodel', 'MADLIB_SCHEMA.plda_mydict', 10,0.5,0.5);
sql> select * from MADLIB_SCHEMA.plda_testresult;
\endcode
@literature
[1] D.M. Blei, A.Y. Ng, M.I. Jordan, <em>Latent Dirichlet Allocation</em>,
Journal of Machine Learning Research, vol. 3, pp. 993-1022, 2003.
[2] T. Griffiths and M. Steyvers, <em>Finding scientific topics</em>,
PNAS, vol. 101, pp. 5228-5235, 2004.
[3] Y. Wang, H. Bai, M. Stanton, W-Y. Chen, and E.Y. Chang, <em>PLDA:
Parallel Latent Dirichlet Allocation for Large-Scale Applications</em>, AAIM, 2009.
[4] http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
[5] J. Chang, Collapsed Gibbs sampling methods for topic models, R manual, 2010.
@sa File plda.sql_in documenting the SQL functions.
*/
-- The plda_topics_t data type stores the assignment of a topic to each word in a document,
-- plus the distribution of those topics within the document.
CREATE TYPE MADLIB_SCHEMA.plda_topics_t AS (
topics int4[],
topic_d int4[]
);
-- Returns a zero'd array of a given dimension
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.plda_zero_array(d int4) RETURNS int4[]
AS 'MODULE_PATHNAME', 'zero_array' LANGUAGE C STRICT;
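-- Usage sketch (illustrative): SELECT MADLIB_SCHEMA.plda_zero_array(3);  -- expected: {0,0,0}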
-- Returns the element-wise sum of two integer arrays
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.plda_sum_int4array(int4[],int4[]) RETURNS int4[]
AS 'MODULE_PATHNAME', 'sum_int4array' LANGUAGE C;
-- Aggregate function for computing the element-wise sum of a set of integer arrays
CREATE AGGREGATE MADLIB_SCHEMA.plda_sum_int4array_agg(int4[])
(
sfunc = MADLIB_SCHEMA.plda_sum_int4array,
stype = int4[]
);
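-- Usage sketch (illustrative; assumes the example output table MADLIB_SCHEMA.plda_corpus
-- from the module documentation): element-wise total of the per-document topic counts
--   SELECT MADLIB_SCHEMA.plda_sum_int4array_agg((topics).topic_d) FROM MADLIB_SCHEMA.plda_corpus;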
-- Returns an array of random topic assignments for a given document length
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.plda_random_topics(doclen int4, numtopics int4) RETURNS MADLIB_SCHEMA.plda_topics_t
AS 'MODULE_PATHNAME', 'randomTopics' LANGUAGE C STRICT;
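-- Usage sketch (illustrative): random initial assignment over 10 topics for a 7-word document
--   SELECT * FROM MADLIB_SCHEMA.plda_random_topics(7, 10);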
-- This function assigns a randomly chosen topic to each word in a document according to
-- the count statistics obtained for the document and the whole corpus so far.
-- Parameters
-- doc : the document to be analysed
-- topics : the topic of each word in the doc
-- topic_d : the topic distribution for the doc
-- global_count : the global word-topic counts
-- topic_counts : the counts of all words in the corpus in each topic
-- num_topics : number of topics to be discovered
-- dsize : size of dictionary
-- alpha : parameter of the Dirichlet prior on per-document topic distributions
-- eta : parameter of the Dirichlet prior on per-topic word distributions
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_sample_new_topics(doc int4[], topics int4[], topic_d int4[], global_count int4[],
topic_counts int4[], num_topics int4, dsize int4, alpha float, eta float)
RETURNS MADLIB_SCHEMA.plda_topics_t
AS 'MODULE_PATHNAME', 'sampleNewTopics' LANGUAGE C STRICT;
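-- See plda_label_document() below for an example of iterating this sampler against a
-- fixed set of corpus-level counts to label a new document.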
-- Computes the per document word-topic counts
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_cword_count(mystate int4[], doc int4[], topics int4[], doclen int4, num_topics int4, dsize int4)
RETURNS int4[]
AS 'MODULE_PATHNAME', 'cword_count' LANGUAGE C;
-- Aggregate function to compute all word-topic counts given topic assignments for each document
CREATE AGGREGATE MADLIB_SCHEMA.plda_cword_agg(int4[], int4[], int4, int4, int4) (
sfunc = MADLIB_SCHEMA.plda_cword_count,
stype = int4[]
);
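-- Usage sketch (illustrative; assumes the example tables from the module documentation,
-- with 10 topics and a 39-word dictionary):
--   SELECT MADLIB_SCHEMA.plda_cword_agg(contents, (topics).topics,
--                                       array_upper(contents,1), 10, 39)
--   FROM MADLIB_SCHEMA.plda_corpus;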
-- The main parallel LDA learning function
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_train(num_topics int4, num_iter int4, alpha float, eta float,
data_table text, dict_table text, model_table text, output_data_table text)
RETURNS int4 AS $$
PythonFunctionBodyOnly(`plda', `plda')
# MADlibSchema comes from PythonFunctionBodyOnly
return plda.plda_train( MADlibSchema, num_topics, num_iter, alpha, eta, data_table, dict_table, model_table, output_data_table)
$$ LANGUAGE plpythonu;
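-- Usage sketch (illustrative; the @usage section calls plda_run() below, but plda_train()
-- can also be invoked directly with the arguments in this order):
--   SELECT MADLIB_SCHEMA.plda_train(10, 30, 0.5, 0.5,
--            'MADLIB_SCHEMA.plda_mycorpus', 'MADLIB_SCHEMA.plda_mydict',
--            'MADLIB_SCHEMA.plda_mymodel', 'MADLIB_SCHEMA.plda_corpus');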
CREATE TYPE MADLIB_SCHEMA.plda_word_weight AS ( word text, prob float, wcount int4 );
-- Returns the most important words for a given topic, based on Pr( word | topic ).
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_topic_word_prob(num_topics int4, topic int4, model_table text, dict_table text)
RETURNS SETOF MADLIB_SCHEMA.plda_word_weight AS $$
PythonFunctionBodyOnly(`plda', `plda')
# MADlibSchema comes from PythonFunctionBodyOnly
return plda.plda_topic_word_prob( MADlibSchema, num_topics, topic, model_table, dict_table)
$$ LANGUAGE plpythonu;
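-- Usage sketch (illustrative; assumes the example model and dictionary tables):
--   SELECT * FROM MADLIB_SCHEMA.plda_topic_word_prob(10, 1,
--            'MADLIB_SCHEMA.plda_mymodel', 'MADLIB_SCHEMA.plda_mydict')
--   ORDER BY prob DESC LIMIT 5;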
CREATE TYPE MADLIB_SCHEMA.plda_word_distrn AS ( word text, distrn int4[], prob float8[] );
-- This function computes the topic assignments to words in a document given previously computed
-- statistics from the training corpus.
-- This function has not been rewritten in PL/Python because it is hard to handle structured types
-- like MADLIB_SCHEMA.plda_topics_t in PL/Python (this has to be done via conversion to text, which is not nice).
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_label_document(doc int4[], global_count int4[], topic_counts int4[], num_topics int4, dsize int4,
alpha float, eta float)
RETURNS MADLIB_SCHEMA.plda_topics_t AS $$
DECLARE
ret MADLIB_SCHEMA.plda_topics_t;
BEGIN
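-- Initialise with a random topic assignment, then run 20 Gibbs sampling sweeps
-- over the document, holding the corpus-level counts fixed.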
ret := MADLIB_SCHEMA.plda_random_topics(array_upper(doc,1), num_topics);
FOR i in 1..20 LOOP
ret := MADLIB_SCHEMA.plda_sample_new_topics(doc,(ret).topics,(ret).topic_d,global_count,topic_counts,num_topics,dsize,alpha,eta);
END LOOP;
RETURN ret;
END;
$$ LANGUAGE plpgsql;
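-- Usage sketch (illustrative; assumes the example model table MADLIB_SCHEMA.plda_mymodel,
-- whose word-topic counts are in column gcounts, and recomputes the per-topic totals from
-- the example output table MADLIB_SCHEMA.plda_corpus; 10 topics, 39-word dictionary):
--   SELECT MADLIB_SCHEMA.plda_label_document('{1,2,3}'::int4[], m.gcounts, t.tsums,
--                                            10, 39, 0.5, 0.5)
--   FROM MADLIB_SCHEMA.plda_mymodel m,
--        (SELECT MADLIB_SCHEMA.plda_sum_int4array_agg((topics).topic_d) AS tsums
--           FROM MADLIB_SCHEMA.plda_corpus) t;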
-- This function computes the topic assignments to documents in a test corpus.
-- The data_table argument appears unnecessary, as long as topic_counts is saved in a table from the plda_train() routine
/**
* @brief This function computes the topic assignments to documents in a test corpus.
*
* @param test_table Name of table containing the test corpus (must have columns <tt>id INT</tt> and <tt>contents INT[]</tt>).
* @param output_table Name of table to store the results.
* @param model_table Name of table where learned model is stored (in the form of word-topic counts and topic counts).
* @param dict_table Name of the table containing the dictionary (must have column \c dict \c TEXT[]).
* @param num_topics Number of topics to discover.
* @param alpha Parameter to the topic Dirichlet prior.
* @param eta Parameter to the Dirichlet prior on the per-topic word distributions.
*
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_label_test_documents(test_table text, output_table text, model_table text, dict_table text, num_topics int4, alpha float, eta float)
RETURNS VOID AS $$
PythonFunctionBodyOnly(`plda', `plda')
# MADlibSchema comes from PythonFunctionBodyOnly
plda.plda_label_test_documents( MADlibSchema, test_table, output_table, model_table, dict_table, num_topics, alpha, eta)
$$ LANGUAGE plpythonu;
-- Returns the distribution of topics for a given word
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.plda_word_topic_distrn(arr int4[], ntopics int4, word int4, OUT ret int4[]) AS $$
SELECT $1[(($3-1)*$2 + 1):(($3-1)*$2 + $2)];
$$ LANGUAGE sql;
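-- The word-topic counts are stored word-major: the ntopics counts for word w occupy
-- positions ((w-1)*ntopics + 1) .. (w*ntopics) of arr, which is the slice returned above.
-- Usage sketch (illustrative; assumes the example model table from the module documentation):
--   SELECT MADLIB_SCHEMA.plda_word_topic_distrn(gcounts, 10, 3) FROM MADLIB_SCHEMA.plda_mymodel;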
/**
* @brief Main plda function
*
* @param datatable Name of table containing the corpus (must have columns <tt>id INT</tt> and <tt>contents INT[]</tt>).
* @param dicttable Name of table containing the dictionary (must have column \c dict \c TEXT[]).
* @param modeltable Name of table where learned model will be stored (in the form of word-topic counts and topic counts).
* @param outputdatatable Name of the table in which the system will store a copy of the datatable plus topic assignments.
* @param numiter Number of iterations to run the Gibbs sampling.
* @param numtopics Number of topics to discover.
* @param alpha Parameter to the topic Dirichlet prior.
* @param eta Parameter to the Dirichlet prior on the per-topic word distributions.
*
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.plda_run(datatable text, dicttable text, modeltable text, outputdatatable text,
numiter int4, numtopics int4, alpha float, eta float)
RETURNS VOID AS $$
PythonFunctionBodyOnly(`plda', `plda')
# MADlibSchema comes from PythonFunctionBodyOnly
plda.plda_run( MADlibSchema, datatable, dicttable, modeltable, outputdatatable, numiter, numtopics, alpha, eta)
$$ LANGUAGE plpythonu;