Documentation: Verify docs in new modules
Changes:
- Changed desc-stats to grp_desc_stats to move the
description information into the right place
- Added grp_summary to mainpage.dox.in and formatted summary
documentation
- Removed 'Parallel Latent Dirichlet Allocation' from mainpage.dox.in
and added usage for computing perplexity in LDA
- Updated gppkg version to 1.3
diff --git a/deploy/gppkg/CMakeLists.txt b/deploy/gppkg/CMakeLists.txt
index 5dba391..f23967e 100644
--- a/deploy/gppkg/CMakeLists.txt
+++ b/deploy/gppkg/CMakeLists.txt
@@ -2,7 +2,7 @@
# Packaging for Greenplum's gppkg
# ------------------------------------------------------------------------------
-set(MADLIB_GPPKG_VERSION "1.2")
+set(MADLIB_GPPKG_VERSION "1.3")
set(MADLIB_GPPKG_RELEASE_NUMBER 1)
set(MADLIB_GPPKG_RPM_SOURCE_DIR
"${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}"
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 0185898..b7aadcb 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -59,8 +59,8 @@
@defgroup grp_unsuplearn Unsupervised Learning
@ingroup grp_modeling
- @defgroup grp_assoc_rules Association Rules
- @ingroup grp_unsuplearn
+ @defgroup grp_assoc_rules Association Rules
+ @ingroup grp_unsuplearn
@defgroup grp_kmeans k-Means Clustering
@ingroup grp_unsuplearn
@@ -71,12 +71,9 @@
@defgroup grp_svdmf SVD Matrix Factorisation
@ingroup grp_unsuplearn
- @defgroup grp_plda Parallel Latent Dirichlet Allocation
+ @defgroup grp_lda Latent Dirichlet Allocation
@ingroup grp_unsuplearn
- @defgroup grp_lda Latent Dirichlet Allocation
- @ingroup grp_unsuplearn
-
@defgroup grp_desc_stats Descriptive Statistics
@defgroup grp_sketches Sketch-based Estimators
@@ -94,6 +91,9 @@
@defgroup grp_profile Profile
@ingroup grp_desc_stats
+ @defgroup grp_summary Summary
+ @ingroup grp_desc_stats
+
@defgroup grp_quantile Quantile
@ingroup grp_desc_stats
diff --git a/methods/sketch/src/pg_gp/sketch_support.c b/methods/sketch/src/pg_gp/sketch_support.c
index 2184baa..22641c3 100644
--- a/methods/sketch/src/pg_gp/sketch_support.c
+++ b/methods/sketch/src/pg_gp/sketch_support.c
@@ -5,7 +5,7 @@
*/
-/*! @addtogroup desc-stats
+/*! @addtogroup grp_desc_stats
*
* @par About:
* MADlib provides a number of descriptive statistics to complement
@@ -29,7 +29,7 @@
*/
/*
-@addtogroup sketches
+@addtogroup grp_sketches
*/
/* THIS CODE MAY NEED TO BE REVISITED TO ENSURE ALIGNMENT! */
diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in
index e357b82..b3fcd8c 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -84,9 +84,15 @@
@usage
- The training (i.e. topic inference) can be done with the following function:
<pre>
- SELECT \ref lda_train(<em>'data_table'</em>, <em>'model_table'</em>,
- <em>'output_data_table'</em>, <em>voc_size</em>, <em>topic_num</em>,
- <em>iter_num</em>, <em>alpha</em>, <em>beta</em>)
+ SELECT \ref lda_train(
+ <em>'data_table'</em>,
+ <em>'model_table'</em>,
+ <em>'output_data_table'</em>,
+ <em>voc_size</em>,
+ <em>topic_num</em>,
+ <em>iter_num</em>,
+ <em>alpha</em>,
+ <em>beta</em>)
</pre>
This function stores the resulting model in <tt><em>model_table</em></tt>.
@@ -114,8 +120,10 @@
- The prediction (i.e. labelling of test documents using a learned LDA model)
can be done with the following function:
<pre>
- SELECT \ref lda_predict(<em>'data_table'</em>, <em>'model_table'</em>,
- <em>'output_table'</em>);
+ SELECT \ref lda_predict(
+ <em>'data_table'</em>,
+ <em>'model_table'</em>,
+ <em>'output_table'</em>);
</pre>
This function stores the prediction results in
@@ -131,6 +139,13 @@
<em>topic_assignment</em> INTEGER[])
</pre>
+- This module also provides a function for computing the perplexity:
+ <pre>
+ SELECT \ref lda_get_perplexity(
+ <em>'model_table'</em>,
+ <em>'output_data_table'</em>);
+ </pre>
+
@implementation
The input format for this module is very common in many machine learning
packages written in various lanugages, which allows users to generate
@@ -206,8 +221,8 @@
- To perform training, we call the lda_train() function with the
appropriate parameters. Here is an example.
\code
- SELECT MADLib.lda_train('my_training', 'my_model', 'my_outdata', 20, 5,
- 10, 5, 0.01);
+ SELECT MADLib.lda_train(
+ 'my_training', 'my_model', 'my_outdata', 20, 5, 10, 5, 0.01);
\endcode
After a successful run of the lda_train() function, two tables will be
@@ -219,20 +234,20 @@
- The topic description by top-k words
\code
- SELECT * FROM MADLib.lda_get_topic_desc('my_model', 'my_vocab',
- 'my_topic_desc', 15);
+ SELECT * FROM MADLib.lda_get_topic_desc(
+ 'my_model', 'my_vocab', 'my_topic_desc', 15);
\endcode
- The per-topic word counts
\code
- SELECT MADLib.lda_get_topic_word_count('my_model',
- 'my_topic_word_count');
+ SELECT MADLib.lda_get_topic_word_count(
+ 'my_model', 'my_topic_word_count');
\endcode
- The per-word topic counts
\code
- SELECT MADLib.lda_get_word_topic_count('my_model',
- 'my_word_topic_count');
+ SELECT MADLib.lda_get_word_topic_count(
+ 'my_model', 'my_word_topic_count');
\endcode
To get the topic counts and the topic assignments for each doucment, we
@@ -240,19 +255,24 @@
- The per-document topic counts:
\code
- SELECT docid, topic_count FROM my_outdata;
+ SELECT
+ docid, topic_count
+ FROM my_outdata;
\endcode
- The per-document topic assignments:
\code
- SELECT docid, words, counts, topic_assignment FROM my_outdata;
+ SELECT
+ docid, words, counts, topic_assignment
+ FROM my_outdata;
\endcode
By scanning \c words, \c counts, and \c topic_assignment together, we can
get the topic assignment for each word in a document.
-- To use a learned LDA model for prediction (i.e. to label new documents), we can use the following commands:
+- To use a learned LDA model for prediction (i.e. to label new documents), we can use the following command:
\code
- SELECT MADLib.lda_predict('my_testing', 'my_model', 'my_pred');
+ SELECT MADLib.lda_predict(
+ 'my_testing', 'my_model', 'my_pred');
\endcode
After a successful run of the lda_predict() function, the prediction
@@ -265,16 +285,26 @@
- The per-document topic counts:
\code
- SELECT docid, topic_count FROM my_pred;
+ SELECT
+ docid, topic_count
+ FROM my_pred;
\endcode
- The per-document topic assignments:
\code
- SELECT docid, words, counts, topic_assignment FROM my_pred;
+ SELECT
+ docid, words, counts, topic_assignment
+ FROM my_pred;
\endcode
By scanning \c words, \c counts, and \c topic_assignment together, we can
get the topic assignment for each word in a document.
-
+
+- To compute the perplexity, we can use the following command:
+ \code
+ SELECT MADLib.lda_get_perplexity(
+ 'my_model', 'my_pred');
+ \endcode
+
@literature
[1] D.M. Blei, A.Y. Ng, M.I. Jordan, <em>Latent Dirichlet Allocation</em>,
diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in
index 538db32..15706c3 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -22,7 +22,6 @@
@usage
The summary function can be invoked in the following way:
@verbatim
------------------------------------------------------------------------
SELECT MADLIB_SCHEMA.summary
(
source_table TEXT, -- Source table name (Required)
@@ -41,7 +40,6 @@
-- (Default: 10)
get_estimates BOOLEAN -- Should we produce exact or estimated values?
) -- (Default: True)
------------------------------------------------------------------------
@endverbatim
Note:
@@ -63,10 +61,10 @@
row_count INT4, -- Number of rows in the output table
duration FLOAT8 -- Time taken (in seconds) to compute the summary
-The summary stastics are stored in the 'output_table' relation provided in the
-arguments. The relation 'output_table' can contain the following table
+The summary statistics are stored in the 'output_table' relation provided in the
+arguments. The relation 'output_table' can contain the following table
(presence of some columns depends on the argument values)
------------------------------------------------------------------------
+@verbatim
- group_by_column : Group-by column names (NULL if none provided)
- group_by_value : Values of the Group-by column (NULL if no grouping)
- target_column : Targeted column values for which summary is requested
@@ -88,7 +86,7 @@
- quantile_array : Percentile values corresponding to ntile_array
- most_frequent_values : Most frequent values
- mfv_frequencies : Frequency of the most frequent values
------------------------------------------------------------------------
+@endverbatim
The output can be obtained as
@verbatim