Documentation: Verify docs in new modules
Changes:
- Changed desc-stats to grp_desc_stats to move the
description information into the right place
- Added grp_summary to mainpage.dox.in and formatted summary
documentation
- Removed 'Parallel Latent Dirichlet Allocation' from mainpage.dox.in
and added usage for computing perplexity in LDA
- Updated gppkg version to 1.3
diff --git a/deploy/gppkg/CMakeLists.txt b/deploy/gppkg/CMakeLists.txt
index 5dba391..f23967e 100644
--- a/deploy/gppkg/CMakeLists.txt
+++ b/deploy/gppkg/CMakeLists.txt
@@ -2,7 +2,7 @@
# Packaging for Greenplum's gppkg
# ------------------------------------------------------------------------------
-set(MADLIB_GPPKG_VERSION "1.2")
+set(MADLIB_GPPKG_VERSION "1.3")
set(MADLIB_GPPKG_RELEASE_NUMBER 1)
set(MADLIB_GPPKG_RPM_SOURCE_DIR
"${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}"
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 0185898..b7aadcb 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -59,8 +59,8 @@
@defgroup grp_unsuplearn Unsupervised Learning
@ingroup grp_modeling
- @defgroup grp_assoc_rules Association Rules
- @ingroup grp_unsuplearn
+ @defgroup grp_assoc_rules Association Rules
+ @ingroup grp_unsuplearn
@defgroup grp_kmeans k-Means Clustering
@ingroup grp_unsuplearn
@@ -71,12 +71,9 @@
@defgroup grp_svdmf SVD Matrix Factorisation
@ingroup grp_unsuplearn
- @defgroup grp_plda Parallel Latent Dirichlet Allocation
+ @defgroup grp_lda Latent Dirichlet Allocation
@ingroup grp_unsuplearn
- @defgroup grp_lda Latent Dirichlet Allocation
- @ingroup grp_unsuplearn
-
@defgroup grp_desc_stats Descriptive Statistics
@defgroup grp_sketches Sketch-based Estimators
@@ -94,6 +91,9 @@
@defgroup grp_profile Profile
@ingroup grp_desc_stats
+ @defgroup grp_summary Summary
+ @ingroup grp_desc_stats
+
@defgroup grp_quantile Quantile
@ingroup grp_desc_stats
diff --git a/methods/sketch/src/pg_gp/sketch_support.c b/methods/sketch/src/pg_gp/sketch_support.c
index 2184baa..22641c3 100644
--- a/methods/sketch/src/pg_gp/sketch_support.c
+++ b/methods/sketch/src/pg_gp/sketch_support.c
@@ -5,7 +5,7 @@
*/
-/*! @addtogroup desc-stats
+/*! @addtogroup grp_desc_stats
*
* @par About:
* MADlib provides a number of descriptive statistics to complement
@@ -29,7 +29,7 @@
*/
/*
-@addtogroup sketches
+@addtogroup grp_sketches
*/
/* THIS CODE MAY NEED TO BE REVISITED TO ENSURE ALIGNMENT! */
diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in
index e357b82..b3fcd8c 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -84,9 +84,15 @@
@usage
- The training (i.e. topic inference) can be done with the following function:
<pre>
- SELECT \ref lda_train(<em>'data_table'</em>, <em>'model_table'</em>,
- <em>'output_data_table'</em>, <em>voc_size</em>, <em>topic_num</em>,
- <em>iter_num</em>, <em>alpha</em>, <em>beta</em>)
+ SELECT \ref lda_train(
+ <em>'data_table'</em>,
+ <em>'model_table'</em>,
+ <em>'output_data_table'</em>,
+ <em>voc_size</em>,
+ <em>topic_num</em>,
+ <em>iter_num</em>,
+ <em>alpha</em>,
+ <em>beta</em>)
</pre>
This function stores the resulting model in <tt><em>model_table</em></tt>.
@@ -114,8 +120,10 @@
- The prediction (i.e. labelling of test documents using a learned LDA model)
can be done with the following function:
<pre>
- SELECT \ref lda_predict(<em>'data_table'</em>, <em>'model_table'</em>,
- <em>'output_table'</em>);
+ SELECT \ref lda_predict(
+ <em>'data_table'</em>,
+ <em>'model_table'</em>,
+ <em>'output_table'</em>);
</pre>
This function stores the prediction results in
@@ -131,6 +139,13 @@
<em>topic_assignment</em> INTEGER[])
</pre>
+- This module also provides a function for computing the perplexity:
+ <pre>
+ SELECT \ref lda_get_perplexity(
+ <em>'model_table'</em>,
+ <em>'output_data_table'</em>);
+ </pre>
+
@implementation
The input format for this module is very common in many machine learning
packages written in various lanugages, which allows users to generate
@@ -206,8 +221,8 @@
- To perform training, we call the lda_train() function with the
appropriate parameters. Here is an example.
\code
- SELECT MADLib.lda_train('my_training', 'my_model', 'my_outdata', 20, 5,
- 10, 5, 0.01);
+ SELECT MADLib.lda_train(
+ 'my_training', 'my_model', 'my_outdata', 20, 5, 10, 5, 0.01);
\endcode
After a successful run of the lda_train() function, two tables will be
@@ -219,20 +234,20 @@
- The topic description by top-k words
\code
- SELECT * FROM MADLib.lda_get_topic_desc('my_model', 'my_vocab',
- 'my_topic_desc', 15);
+ SELECT * FROM MADLib.lda_get_topic_desc(
+ 'my_model', 'my_vocab', 'my_topic_desc', 15);
\endcode
- The per-topic word counts
\code
- SELECT MADLib.lda_get_topic_word_count('my_model',
- 'my_topic_word_count');
+ SELECT MADLib.lda_get_topic_word_count(
+ 'my_model', 'my_topic_word_count');
\endcode
- The per-word topic counts
\code
- SELECT MADLib.lda_get_word_topic_count('my_model',
- 'my_word_topic_count');
+ SELECT MADLib.lda_get_word_topic_count(
+ 'my_model', 'my_word_topic_count');
\endcode
To get the topic counts and the topic assignments for each doucment, we
@@ -240,19 +255,24 @@
- The per-document topic counts:
\code
- SELECT docid, topic_count FROM my_outdata;
+ SELECT
+ docid, topic_count
+ FROM my_outdata;
\endcode
- The per-document topic assignments:
\code
- SELECT docid, words, counts, topic_assignment FROM my_outdata;
+ SELECT
+ docid, words, counts, topic_assignment
+ FROM my_outdata;
\endcode
By scanning \c words, \c counts, and \c topic_assignment together, we can
get the topic assignment for each word in a document.
-- To use a learned LDA model for prediction (i.e. to label new documents), we can use the following commands:
+- To use a learned LDA model for prediction (i.e. to label new documents), we can use the following command:
\code
- SELECT MADLib.lda_predict('my_testing', 'my_model', 'my_pred');
+ SELECT MADLib.lda_predict(
+ 'my_testing', 'my_model', 'my_pred');
\endcode
After a successful run of the lda_predict() function, the prediction
@@ -265,16 +285,26 @@
- The per-document topic counts:
\code
- SELECT docid, topic_count FROM my_pred;
+ SELECT
+ docid, topic_count
+ FROM my_pred;
\endcode
- The per-document topic assignments:
\code
- SELECT docid, words, counts, topic_assignment FROM my_pred;
+ SELECT
+ docid, words, counts, topic_assignment
+ FROM my_pred;
\endcode
By scanning \c words, \c counts, and \c topic_assignment together, we can
get the topic assignment for each word in a document.
-
+
+- To compute the perplexity, we can use the following command:
+ \code
+ SELECT MADLib.lda_get_perplexity(
+ 'my_model', 'my_pred');
+ \endcode
+
@literature
[1] D.M. Blei, A.Y. Ng, M.I. Jordan, <em>Latent Dirichlet Allocation</em>,
diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in
index 538db32..15706c3 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -22,7 +22,6 @@
@usage
The summary function can be invoked in the following way:
@verbatim
------------------------------------------------------------------------
SELECT MADLIB_SCHEMA.summary
(
source_table TEXT, -- Source table name (Required)
@@ -41,7 +40,6 @@
-- (Default: 10)
get_estimates BOOLEAN -- Should we produce exact or estimated values?
) -- (Default: True)
------------------------------------------------------------------------
@endverbatim
Note:
@@ -63,10 +61,10 @@
row_count INT4, -- Number of rows in the output table
duration FLOAT8 -- Time taken (in seconds) to compute the summary
-The summary stastics are stored in the 'output_table' relation provided in the
-arguments. The relation 'output_table' can contain the following table
+The summary statistics are stored in the 'output_table' relation provided in the
+arguments. The relation 'output_table' can contain the following table
(presence of some columns depends on the argument values)
------------------------------------------------------------------------
+@verbatim
- group_by_column : Group-by column names (NULL if none provided)
- group_by_value : Values of the Group-by column (NULL if no grouping)
- target_column : Targeted column values for which summary is requested
@@ -88,7 +86,7 @@
- quantile_array : Percentile values corresponding to ntile_array
- most_frequent_values : Most frequent values
- mfv_frequencies : Frequency of the most frequent values
------------------------------------------------------------------------
+@endverbatim
The output can be obtained as
@verbatim