Documentation: Fix various inconsistencies in documentation

Pivotal Tracker: 58478260

Additional authors:
    - Hai Qian <hqian@gopivotal.com>
    - Shengwen Yang <syang@gopivotal.com>
    - Xixuan Feng <xfeng@gopivotal.com>

Changes:
    - Complete the release notes
    - Set the gppkg version number to 1.8
    - Fix various documentation errors in multiple modules
    - Fix an incorrect function declaration in margins_mlogregr
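The release notes below describe consolidating the optimizer arguments of the multinomial estimator functions into a single string. A sketch of such a call is shown here; the table name, column names, and argument order are illustrative only and are not taken from this change set:

```sql
-- Hypothetical invocation: 'patients' and its columns are placeholder names.
-- The point is the single consolidated optimizer-params string, which
-- replaces the former separate max_iter/optimizer/precision arguments.
SELECT madlib.margins_mlogregr(
    'patients',                      -- source table (placeholder)
    'patients_margins',              -- output table (placeholder)
    'outcome',                       -- dependent variable (placeholder)
    'ARRAY[1, age, treatment]',      -- independent variables (placeholder)
    'max_iter=20, optimizer=irls, precision=0.0001'  -- consolidated params
);
```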
diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt
index ecaf4d5..0394765 100644
--- a/ReleaseNotes.txt
+++ b/ReleaseNotes.txt
@@ -9,6 +9,70 @@
 
 Current list of bugs and issues can be found at http://jira.madlib.net.
 --------------------------------------------------------------------------------
+MADlib v1.4
+
+Release Date: 2013-Nov-25
+
+New Features:
+* Improved interface for Multinomial logistic regression:
+    - Added a new interface that accepts an 'output_table' parameter and
+    stores the model details in the output table instead of returning them as
+    a composite data type. The updated function also builds a summary table
+    that includes all parameters and meta-parameters used during model training.
+    - The output table has been reformatted to present the model coefficients
+    and related metrics for each category in a separate row. This replaces the
+    old output format of model stats for all categories combined in a
+    single array.
+* Variance Estimators
+    - Added a robust variance estimator for Cox PH models (Lin and Wei, 1989).
+    It is useful for calculating variances in a dataset with potentially
+    noisy outliers: the standard-error estimates remain asymptotically valid
+    even if the model is misspecified because of outliers.
+    - Added a clustered variance estimator for Cox PH models. It is used
+    when the data contains extra clustering information beyond the covariates,
+    and it yields asymptotically normal estimates.
+* NULL Handling:
+    - Modified the behavior of regression modules to omit rows containing NULL
+    values in any of the dependent or independent variables. The number of
+    rows skipped is reported as part of the output table.
+    This release includes NULL handling for the following modules:
+        - Linear, Logistic, and Multinomial logistic regression, as well as
+        Cox Proportional Hazards
+        - Huber-White sandwich estimators for linear, logistic, and multinomial
+        logistic regression as well as Cox Proportional Hazards
+        - Clustered variance estimators for linear, logistic, and multinomial
+        logistic regression as well as Cox Proportional Hazards
+        - Marginal effects for logistic and multinomial logistic regression
+
+Deprecated functions:
+    - The multinomial logistic regression function has been renamed to
+    'mlogregr_train'. The old function ('mlogregr') has been deprecated
+    and will be removed in the next major version update.
+
+    - For all multinomial regression estimator functions (list given below),
+    changes in the argument list were made to collate all optimizer specific
+    arguments in a single string. An example of the new optimizer parameter is
+    'max_iter=20, optimizer=irls, precision=0.0001'.
+    This is in contrast to the original argument list that contained 3 arguments:
+    'max_iter', 'optimizer', and 'precision'. This change allows adding new
+    optimizer-specific parameters without changing the argument list.
+    Affected functions:
+        - robust_variance_mlogregr
+        - clustered_variance_mlogregr
+        - margins_mlogregr
+
+Bug Fixes:
+    - Fixed an overflow problem in LDA by using INT64 instead of INT32.
+    - Fixed an integer-to-boolean cast bug in clustered variance for logistic
+    regression. After this fix, integer columns are accepted as the binary
+    dependent variable using the 'integer to bool' cast rules.
+    - Fixed two bugs in SVD:
+        - The 'example' option for online help has been fixed
+        - Column names for sparse input tables in the 'svd_sparse' and
+        'svd_sparse_native' functions are no longer restricted to 'row_id',
+        'col_id' and 'value'.
+
+--------------------------------------------------------------------------------
 MADlib v1.3
 
 Release Date: 2013-October-03
diff --git a/deploy/gppkg/CMakeLists.txt b/deploy/gppkg/CMakeLists.txt
index 14bd153..4856771 100644
--- a/deploy/gppkg/CMakeLists.txt
+++ b/deploy/gppkg/CMakeLists.txt
@@ -2,8 +2,7 @@
 # Packaging for Greenplum's gppkg
 # ------------------------------------------------------------------------------
 
-# set(MADLIB_GPPKG_VERSION "ossv1.4_pv1.7.2_gpdb4.2")
-set(MADLIB_GPPKG_VERSION "1.7.2")
+set(MADLIB_GPPKG_VERSION "1.8")
 set(MADLIB_GPPKG_RELEASE_NUMBER 1)
 set(MADLIB_GPPKG_RPM_SOURCE_DIR
     "${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}"
diff --git a/src/ports/postgres/modules/regress/clustered_variance.py_in b/src/ports/postgres/modules/regress/clustered_variance.py_in
index 924067d..8638840 100644
--- a/src/ports/postgres/modules/regress/clustered_variance.py_in
+++ b/src/ports/postgres/modules/regress/clustered_variance.py_in
@@ -41,7 +41,7 @@
     Create the SQL query to execute and create the result table
     """
     sqlPart1 = """
-    create table {tbl_output} as
+    create table {out_table} as
         select (f).* from (
             select {schema_madlib}.__clustered_{regr_type}_compute_stats(
                 max(coef),
@@ -59,7 +59,7 @@
                                 ({indvar})::DOUBLE PRECISION[],
                                 ({coef_str})::DOUBLE PRECISION[],
                                 (select count(distinct {depvar})
-                                 from {tbl_data})::INTEGER,
+                                 from {source_table})::INTEGER,
                                 ({ref_category})::INTEGER) as g,""".format(**kwargs)
     elif kwargs['regr_type'] == 'log':
         sqlPart2 = """{schema_madlib}.__clustered_err_{regr_type}_step(
@@ -77,7 +77,7 @@
                 FROM (
                       SELECT u.coef as {coef_str}, v.*
                       FROM
-                        {fitres} u, {tbl_data} v
+                        {fitres} u, {source_table} v
                       where
                          ({indvar}) is not NULL and
                          {schema_madlib}.array_contains_null({indvar}) is False and
@@ -90,7 +90,7 @@
 # ========================================================================
 
 
-def clustered_variance_linregr(schema_madlib, tbl_data, tbl_output,
+def clustered_variance_linregr(schema_madlib, source_table, out_table,
                                depvar, indvar, clustervar, grouping_col,
                                **kwargs):
     """
@@ -100,8 +100,8 @@
                                  name='client_min_messages'")[0]['setting']
     plpy.execute("set client_min_messages to error")
 
-    validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
-                                             tbl_output, depvar, indvar,
+    validate_args_clustered_variance_linregr(schema_madlib, source_table,
+                                             out_table, depvar, indvar,
                                              clustervar, grouping_col)
 
     (fitres, coef_str, cluster_grouping_str,
@@ -109,28 +109,28 @@
 
     plpy.execute(
         """
-        select {schema_madlib}.linregr_train('{tbl_data}',
+        select {schema_madlib}.linregr_train('{source_table}',
             '{fitres}', '{depvar}', '{indvar}', {grouping_col})
-        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
+        """.format(schema_madlib=schema_madlib, source_table=source_table,
                    fitres=fitres, depvar=depvar, indvar=indvar,
                    grouping_col=grouping_col_str))
 
     plpy.execute(
         """
-        alter table {fitres}_summary rename to {tbl_output}_summary;
-        """.format(fitres=fitres, tbl_output=tbl_output))
+        alter table {fitres}_summary rename to {out_table}_summary;
+        """.format(fitres=fitres, out_table=out_table))
 
     plpy.execute(
         """
-        update {tbl_output}_summary set out_table = '{tbl_output}'
-        """.format(tbl_output=tbl_output))
+        update {out_table}_summary set out_table = '{out_table}'
+        """.format(out_table=out_table))
 
     plpy.execute(
         __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                  indvar=indvar, coef_str=coef_str,
-                                 tbl_data=tbl_data, fitres=fitres,
+                                 source_table=source_table, fitres=fitres,
                                  cluster_grouping_str=cluster_grouping_str,
-                                 tbl_output=tbl_output, regr_type="lin"))
+                                 out_table=out_table, regr_type="lin"))
 
     plpy.execute("drop table if exists {fitres}".format(fitres=fitres))
 
@@ -139,25 +139,25 @@
 # ========================================================================
 
 
-def validate_args_clustered_variance(schema_madlib, tbl_data,
-                                     tbl_output, depvar, indvar,
+def validate_args_clustered_variance(schema_madlib, source_table,
+                                     out_table, depvar, indvar,
                                      clustervar, grouping_col):
     """
     Validate the parameters
     """
-    if not tbl_data or tbl_data.lower() in ('null', '') or not table_exists(tbl_data):
+    if not source_table or source_table.lower() in ('null', '') or not table_exists(source_table):
         plpy.error("Clustered variance estimation error: Data table does not exist!")
 
-    if table_is_empty(tbl_data):
+    if table_is_empty(source_table):
         plpy.error("Clustered variance estimation error: Data table is empty!")
 
-    if tbl_output is None or tbl_output.lower() in ('null', ''):
+    if out_table is None or out_table.lower() in ('null', ''):
         plpy.error("Clustered variance estimation error: Invalid output table name!")
 
-    if table_exists(tbl_output):
+    if table_exists(out_table):
         plpy.error("Clustered variance estimation error: Output table exists!")
 
-    if table_exists(tbl_output + "_summary"):
+    if table_exists(out_table + "_summary"):
         plpy.error("Clustered variance estimation error: Output summary table exists!")
 
     if depvar is None or (not isinstance(depvar, str)) or depvar.lower() in ('null', ''):
@@ -183,7 +183,7 @@
     #     plpy.error("Clustered variance estimation error: dependent variable is a constant!")
 
     # try:
-    #     plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = tbl_data))
+    #     plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = source_table))
     #     success = True
     # except:
     #     success = False
@@ -191,14 +191,14 @@
     #     plpy.error("Clustered variance estimation error: independent variable does not exist in the data table!")
 
     # try:
-    #     plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = tbl_data))
+    #     plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = source_table))
     #     success = True
     # except:
     #     success = False
     # if not success:
     #     plpy.error("Clustered variance estimation error: dependent variable does not exist in the data table!")
 
-    # if not scalar_col_has_no_null(tbl_data, depvar):
+    # if not scalar_col_has_no_null(source_table, depvar):
     #     plpy.error("Clustered variance estimation error: Dependent variable has Null values! \
     #                 Please filter out Null values before using this function!")
 
@@ -207,7 +207,7 @@
         plpy.error("Clustered variance estimation error: Invalid cluster columns name!")
 
     if clustervar is not None:
-        if not columns_exist_in_table(tbl_data,
+        if not columns_exist_in_table(source_table,
                                       _string_to_array(clustervar),
                                       schema_madlib):
             plpy.error("Clustered variance estimation error: Cluster column does not exist!")
@@ -217,7 +217,7 @@
         plpy.error("Clustered variance estimation error: Invalid grouping columns name!")
 
     if grouping_col:
-        if not columns_exist_in_table(tbl_data,
+        if not columns_exist_in_table(source_table,
                                       _string_to_array(grouping_col),
                                       schema_madlib):
             plpy.error("Clustered variance estimation error: Grouping column does not exist!")
@@ -225,14 +225,14 @@
 
 # ========================================================================
 
-def validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
-                                             tbl_output, depvar, indvar,
+def validate_args_clustered_variance_linregr(schema_madlib, source_table,
+                                             out_table, depvar, indvar,
                                              clustervar, grouping_col):
     """
     Validate the parameters
     """
-    validate_args_clustered_variance(schema_madlib, tbl_data,
-                                     tbl_output, depvar, indvar,
+    validate_args_clustered_variance(schema_madlib, source_table,
+                                     out_table, depvar, indvar,
                                      clustervar, grouping_col)
 # ========================================================================
 
@@ -253,8 +253,8 @@
         the linear regression.
 
         SELECT {schema_madlib}.clustered_variance_linregr(
-            'tbl_data',
-            'tbl_output',
+            'source_table',
+            'out_table',
             'depvar',
             'indvar',
             'clustervar',
@@ -273,8 +273,8 @@
         Usage:
         ----------------------------------------------------------------
         SELECT {schema_madlib}.clustered_variance_linregr(
-            'tbl_data',      -- Name of data table
-            'tbl_output',    -- Name of result table (raise an error if it already exists)
+            'source_table',  -- Name of data table
+            'out_table',     -- Name of result table (raise an error if it already exists)
             'depvar',        -- Expression for dependent variable
             'indvar',        -- Expression for independent variables
             'clustervar',    -- Column names for cluster variables, separated by comma
@@ -298,7 +298,7 @@
 # ========================================================================
 
 
-def clustered_variance_logregr(schema_madlib, tbl_data, tbl_output,
+def clustered_variance_logregr(schema_madlib, source_table, out_table,
                                depvar, indvar, clustervar, grouping_col,
                                max_iter, optimizer, tolerance, verbose_mode,
                                **kwargs):
@@ -308,8 +308,8 @@
     if optimizer is not None and optimizer.lower() == 'newton':
         optimizer = 'irls'
 
-    validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
-                                             tbl_output, depvar, indvar,
+    validate_args_clustered_variance_logregr(schema_madlib, source_table,
+                                             out_table, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose_mode)
@@ -320,10 +320,10 @@
     plpy.execute(
         """
         SELECT {schema_madlib}.logregr_train(
-            '{tbl_data}', '{fitres}', '{depvar}', '{indvar}',
+            '{source_table}', '{fitres}', '{depvar}', '{indvar}',
             {grouping_col}, {max_iter}, '{optimizer}', {tolerance},
             {verbose})
-        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
+        """.format(schema_madlib=schema_madlib, source_table=source_table,
                    fitres=fitres, depvar=depvar, indvar=indvar,
                    grouping_col=grouping_col_str, max_iter=max_iter,
                    optimizer=optimizer, tolerance=tolerance,
@@ -338,20 +338,20 @@
 
     plpy.execute(
         """
-        alter table {fitres}_summary rename to {tbl_output}_summary;
-        """.format(fitres=fitres, tbl_output=tbl_output))
+        alter table {fitres}_summary rename to {out_table}_summary;
+        """.format(fitres=fitres, out_table=out_table))
 
     plpy.execute(
         """
-        update {tbl_output}_summary set out_table = '{tbl_output}'
-        """.format(tbl_output=tbl_output))
+        update {out_table}_summary set out_table = '{out_table}'
+        """.format(out_table=out_table))
 
     plpy.execute(
         __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                  indvar=indvar, coef_str=coef_str,
-                                 tbl_data=tbl_data, fitres=fitres,
+                                 source_table=source_table, fitres=fitres,
                                  cluster_grouping_str=cluster_grouping_str,
-                                 tbl_output=tbl_output, regr_type="log"))
+                                 out_table=out_table, regr_type="log"))
 
     plpy.execute(
         """
@@ -363,16 +363,16 @@
 # ========================================================================
 
 
-def validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
-                                             tbl_output, depvar, indvar,
+def validate_args_clustered_variance_logregr(schema_madlib, source_table,
+                                             out_table, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose_mode):
     """
     Validate the parameters
     """
-    validate_args_clustered_variance(schema_madlib, tbl_data,
-                                     tbl_output, depvar, indvar,
+    validate_args_clustered_variance(schema_madlib, source_table,
+                                     out_table, depvar, indvar,
                                      clustervar, grouping_col)
     if max_iter is None or max_iter <= 0:
         plpy.error("Clustered variance estimation error: Maximum number of "
@@ -409,8 +409,8 @@
         the logistic regression.
 
         SELECT {schema_madlib}.clustered_variance_logregr(
-            'tbl_data',
-            'tbl_output',
+            'source_table',
+            'out_table',
             'depvar',
             'indvar',
             'clustervar',
@@ -433,8 +433,8 @@
         Usage:
         ----------------------------------------------------------------
         SELECT {schema_madlib}.clustered_variance_logregr(
-            'tbl_data',      -- Name of data table
-            'tbl_output',    -- Name of result table (raise an error if it already exists)
+            'source_table',  -- Name of data table
+            'out_table',     -- Name of result table (raise an error if it already exists)
             'depvar',        -- Expression for dependent variable
             'indvar',        -- Expression for independent variables
             'clustervar',    -- Column names for cluster variables, separated by comma
@@ -512,7 +512,7 @@
 
     plpy.execute(
         """
-        SELECT 
+        SELECT
             {schema_madlib}.mlogregr_train(
                 '{source_table}', '{fitres}',
                 '{dependent_varname}', '{independent_varname}', {ref_category},
@@ -535,9 +535,9 @@
     plpy.execute("""
         CREATE TEMP TABLE {coef_table} AS
         SELECT
-            {schema_madlib}.matrix_agg(coef ORDER BY category) AS coef 
+            {schema_madlib}.matrix_agg(coef ORDER BY category) AS coef
         FROM {fitres}
-        """.format(schema_madlib=schema_madlib, fitres=fitres, coef_table=coef_table)) 
+        """.format(schema_madlib=schema_madlib, fitres=fitres, coef_table=coef_table))
 
     plpy.execute(
         """
@@ -548,8 +548,8 @@
         __generate_clustered_sql(
             schema_madlib=schema_madlib,
             depvar=dependent_varname, indvar=independent_varname,
-            coef_str=coef_str, tbl_data=source_table, fitres=coef_table,
-            cluster_grouping_str=cluster_grouping_str, tbl_output=out_table,
+            coef_str=coef_str, source_table=source_table, fitres=coef_table,
+            cluster_grouping_str=cluster_grouping_str, out_table=out_table,
             ref_category=ref_category, regr_type="mlog"))
 
     num_categories = plpy.execute(
@@ -561,7 +561,7 @@
         FROM {source_table} LIMIT 1
         """.format(independent_varname=independent_varname, source_table=source_table))[0]['fnum']
 
-    tmp_table = __unique_string() 
+    tmp_table = __unique_string()
     plpy.execute("""
         CREATE TABLE {tmp_table} AS
         SELECT
@@ -574,7 +574,7 @@
                 coef, {num_features},
                 {num_categories}, {ref_category})
             ).coef AS coef,
-            ({schema_madlib}.__mlogregr_format( 
+            ({schema_madlib}.__mlogregr_format(
                 std_err, {num_features},
                 {num_categories}, {ref_category})
             ).coef AS std_err,
@@ -678,8 +678,8 @@
         the multi-logistic regression.
 
         SELECT {schema_madlib}.clustered_variance_mlogregr(
-            'tbl_data',
-            'tbl_output',
+            'source_table',
+            'out_table',
             'depvar',
             'indvar',
             'clustervar',
@@ -703,8 +703,8 @@
         Usage:
         ----------------------------------------------------------------
         SELECT {schema_madlib}.clustered_variance_mlogregr(
-            'tbl_data',      -- Name of data table
-            'tbl_output',    -- Name of result table (raise an error if it already exists)
+            'source_table',  -- Name of data table
+            'out_table',     -- Name of result table (raise an error if it already exists)
             'depvar',        -- Expression for dependent variable
             'indvar',        -- Expression for independent variables
             'clustervar',    -- Column names for cluster variables, separated by comma
diff --git a/src/ports/postgres/modules/regress/clustered_variance.sql_in b/src/ports/postgres/modules/regress/clustered_variance.sql_in
index 71d0f86..c397096 100644
--- a/src/ports/postgres/modules/regress/clustered_variance.sql_in
+++ b/src/ports/postgres/modules/regress/clustered_variance.sql_in
@@ -35,27 +35,54 @@
 
 The clustered variance linear regression training function has the following syntax.
 <pre class="syntax">
-clustered_variance_linregr ( tbl_data,
-                             tbl_output,
-                             depvar,
-                             indvar,
+clustered_variance_linregr ( source_table,
+                             out_table,
+                             dependent_varname,
+                             independent_varname,
                              clustervar,
-                             groupingvar
+                             grouping_cols
                            )
 </pre>
 \b Arguments
 <dl class="arglist">
-<dt>tbl_data</dt>
-<dd>TEXT. The name of the table containing the input data.</dd>
-<dt>tbl_output</dt>
-<dd>TEXT. The name of the table to store the regression model.</dd>
-<dt>depvar</dt>
+  <dt>source_table</dt>
+  <dd>TEXT. The name of the table containing the input data.</dd>
+
+  <dt>out_table</dt>
+  <dd>TEXT. Name of the generated table containing the output model. The output table contains the following columns:
+    <table class="output">
+      <tr>
+        <th>coef</th>
+        <td>DOUBLE PRECISION[]. Vector of the coefficients of the regression.</td>
+      </tr>
+      <tr>
+        <th>std_err</th>
+        <td>DOUBLE PRECISION[]. Vector of the standard error of the coefficients.</td>
+      </tr>
+      <tr>
+        <th>t_stats</th>
+        <td>DOUBLE PRECISION[]. Vector of the t-stats of the coefficients.</td>
+      </tr>
+      <tr>
+        <th>p_values</th>
+        <td>DOUBLE PRECISION[]. Vector of the p-values of the coefficients.</td>
+      </tr>
+    </table>
+
+    A summary table named <out_table>_summary is also created, which is the
+    same as the summary table created by the linregr_train() function.
+    Please refer to the documentation for linear regression for details.
+
+  </dd>
+
+<dt>dependent_varname</dt>
 <dd>TEXT. An expression to evaluate for the dependent variable.</dd>
-<dt>indvar</dt>
+<dt>independent_varname</dt>
 <dd>TEXT. An expression to evaluate for the independent variables.</dd>
 <dt>clustervar</dt>
 <dd>TEXT. A comma-separated list of the columns to use as cluster variables.</dd>
-<dt>groupingvar (optional)</dt>
+<dt>grouping_cols (optional)</dt>
+
 <dd>TEXT, default: NULL. <em>Not currently implemented. Any non-NULL value is ignored.</em> An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is null, no grouping is used and a single result model is generated.</dd>
 </dl>
 
@@ -64,12 +91,12 @@
 
 The clustered variance logistic regression training function has the following syntax.
 <pre class="syntax">
-clustered_variance_logregr( tbl_data,
-                            tbl_output,
-                            depvar,
-                            indvar,
+clustered_variance_logregr( source_table,
+                            out_table,
+                            dependent_varname,
+                            independent_varname,
                             clustervar,
-                            groupingvar,
+                            grouping_cols,
                             max_iter,
                             optimizer,
                             tolerance,
@@ -78,17 +105,41 @@
 </pre>
 \b Arguments
 <dl class="arglist">
-    <dt>tbl_data</dt>
+    <dt>source_table</dt>
     <dd>TEXT. The name of the table containing the input data.</dd>
-    <dt>tbl_output</dt>
-    <dd>TEXT. The name of the table to store the regression model.</dd>
-    <dt>depvar</dt>
+    <dt>out_table</dt>
+    <dd>TEXT. Name of the generated table containing the output model. The output table has the following columns:
+      <table class="output">
+        <tr>
+          <th>coef</th>
+          <td>Vector of the coefficients of the regression.</td>
+        </tr>
+        <tr>
+          <th>std_err</th>
+          <td>Vector of the standard error of the coefficients.</td>
+        </tr>
+        <tr>
+          <th>z_stats</th>
+          <td>Vector of the z-stats of the coefficients.</td>
+        </tr>
+        <tr>
+          <th>p_values</th>
+          <td>Vector of the p-values of the coefficients.</td>
+        </tr>
+      </table>
+
+      A summary table named <out_table>_summary is also created, which is the
+      same as the summary table created by the logregr_train() function. Please
+      refer to the documentation for logistic regression for details.
+
+    </dd>
+    <dt>dependent_varname</dt>
     <dd>TEXT. An expression to evaluate for the dependent variable.</dd>
-    <dt>indvar</dt>
+    <dt>independent_varname</dt>
     <dd>TEXT. An expression to evaluate for the independent variable.</dd>
     <dt>clustervar</dt>
     <dd>TEXT. A comma-separated list of columns to use as cluster variables.</dd>
-    <dt>groupingvar (optional)</dt>
+    <dt>grouping_cols (optional)</dt>
     <dd>TEXT, default: NULL. <em>Not yet implemented. Any non-NULL values are ignored.</em> An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is NULL, no grouping is used and a single result model is generated.</dd>
     <dt>max_iter (optional)</dt>
     <dd>INTEGER, default: 20. The maximum number of iterations that are allowed.</dd>
@@ -129,7 +180,37 @@
     <dt>source_table</dt>
     <dd>TEXT. The name of the table containing the input data.</dd>
     <dt>out_table</dt>
-    <dd>TEXT. The name of the table to store the regression model. </dd>
+    <dd>TEXT. The name of the table where the regression model will be stored.
+    The output table has the following columns:
+        <table class="output">
+          <tr>
+            <th>category</th>
+            <td>The category.</td>
+          </tr>
+          <tr>
+            <th>ref_category</th>
+            <td>The reference category used for modeling.</td>
+          </tr>
+          <tr>
+            <th>coef</th>
+            <td>Vector of the coefficients of the regression.</td>
+          </tr>
+          <tr>
+            <th>std_err</th>
+            <td>Vector of the standard error of the coefficients.</td>
+          </tr>
+          <tr>
+            <th>z_stats</th>
+            <td>Vector of the z-stats of the coefficients.</td>
+          </tr>
+          <tr>
+            <th>p_values</th>
+            <td>Vector of the p-values of the coefficients.</td>
+          </tr>
+        </table>
+
+        A summary table named <out_table>_summary is also created, which is the same as the summary table created by the mlogregr_train() function. Please refer to the documentation for multinomial logistic regression for details.
+    </dd>
     <dt>dependent_varname</dt>
     <dd>TEXT. An expression to evaluate for the dependent variable.</dd>
     <dt>independent_varname</dt>
@@ -153,20 +234,52 @@
 The clustered robust variance estimator function for the Cox
 Proportional Hazards model has the following syntax.
 <pre class="syntax">
-clustered_variance_coxph( model_table,
-                          output_table,
-                          clustervar
-                        )
+clustered_variance_coxph(model_table, output_table, clustervar)
 </pre>
 
 \b Arguments
 <dl class="arglist">
-<dt>model_table</dt>
-<dd>TEXT. The name of the table containing the input data.</dd>
-<dt>output_table</dt>
-<dd>TEXT. The name of the table to store the regression model.</dd>
-<dt>clustervar</dt>
-<dd>TEXT. A comma-separated list of columns to use as cluster variables.</dd>
+    <dt>model_table</dt>
+    <dd>TEXT. The name of the model table, which is exactly the same as the 'output_table' parameter of the coxph_train() function.</dd>
+    <dt>output_table</dt>
+    <dd>TEXT. The name of the table where the output is saved. It has the following columns:
+        <table class="output">
+            <tr>
+                <th>coef</th>
+                <td>FLOAT8[]. Vector of the coefficients.</td>
+            </tr>
+            <tr>
+                <th>loglikelihood</th>
+                <td>FLOAT8. Log-likelihood value of the MLE estimate.</td>
+            </tr>
+            <tr>
+                <th>std_err</th>
+                <td>FLOAT8[]. Vector of the standard error of the coefficients.</td>
+            </tr>
+            <tr>
+                <th>clustervar</th>
+                <td>TEXT. A comma-separated list of columns to use as cluster variables.</td>
+            </tr>
+            <tr>
+                <th>clustered_se</th>
+                <td>FLOAT8[]. Vector of the robust standard errors of the coefficients.</td>
+            </tr>
+            <tr>
+                <th>clustered_z</th>
+                <td>FLOAT8[]. Vector of the robust z-stats of the coefficients.</td>
+            </tr>
+            <tr>
+                <th>clustered_p</th>
+                <td>FLOAT8[]. Vector of the robust p-values of the coefficients.</td>
+            </tr>
+            <tr>
+                <th>hessian</th>
+                <td>FLOAT8[]. The Hessian matrix.</td>
+            </tr>
+        </table>
+    </dd>
+    <dt>clustervar</dt>
+    <dd>TEXT. A comma-separated list of columns to use as cluster variables.</dd>
 </dl>
 
 @anchor examples
@@ -179,15 +292,15 @@
 
 -# Run the linear regression function and view the results.
 <pre class="example">
-DROP TABLE IF EXISTS tbl_output;
+DROP TABLE IF EXISTS out_table;
 SELECT madlib.clustered_variance_linregr( 'abalone',
-                                          'tbl_output',
+                                          'out_table',
                                           'rings',
                                           'ARRAY[1, diameter, length, width]',
                                           'sex',
                                           NULL
                                         );
-SELECT * FROM tbl_output;
+SELECT * FROM out_table;
 </pre>
 
 -# View online help for the clustered variance logistic regression function.
@@ -197,14 +310,14 @@
 
 -# Run the logistic regression function and view the results.
 <pre class="example">
-DROP TABLE IF EXISTS tbl_output;
+DROP TABLE IF EXISTS out_table;
 SELECT madlib.clustered_variance_logregr( 'abalone',
-                                          'tbl_output',
+                                          'out_table',
                                           'rings < 10',
                                           'ARRAY[1, diameter, length, width]',
                                           'sex'
                                         );
-SELECT * FROM tbl_output;
+SELECT * FROM out_table;
 </pre>
 
 -# View online help for the clustered variance multinomial logistic regression function.
@@ -214,15 +327,15 @@
 
 -# Run the multinomial logistic regression and view the results.
 <pre class="example">
-DROP TABLE IF EXISTS tbl_output;
+DROP TABLE IF EXISTS out_table;
 SELECT madlib.clustered_variance_mlogregr( 'abalone',
-                                           'tbl_output',
+                                           'out_table',
                                            'CASE WHEN rings < 10 THEN 1 ELSE 0 END',
                                            'ARRAY[1, diameter, length, width]',
                                            'sex',
                                            0
                                          );
-SELECT * FROM tbl_output;
+SELECT * FROM out_table;
 </pre>
 
 -# Run the Cox Proportional Hazards regression and compute the clustered robust estimator.
@@ -246,7 +359,9 @@
 @anchor notes
 @par Notes
 
-- Note that we need to manually include an intercept term in the independent variable expression. The NULL value of <em>groupingvar</em> means that there is no grouping in the calculation.
+- Note that we need to manually include an intercept term in the independent
+variable expression. The NULL value of <em>groupingvar</em> means that there
+is no grouping in the calculation.
 
 
 @anchor background
@@ -285,14 +400,15 @@
 \f]
 where \f$G_m\f$ is the set of rows that belong to the same cluster.
 
-We can compute the quantities of \f$B\f$ and \f$A\f$ for each cluster during one scan through
-the data table in an aggregate function. Then sum over all clusters to
-the full \f$B\f$ and \f$A\f$ in the outside of the aggregate function. At last, the matrix mulplitications
-are
-done in a separate function on the master node.
+We can compute the quantities \f$B\f$ and \f$A\f$ for each cluster during
+one scan through the data table in an aggregate function, then sum over all
+clusters to obtain the full \f$B\f$ and \f$A\f$ outside the aggregate function.
+Finally, the matrix multiplications are done in a separate function on the
+master node.
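The per-cluster aggregation and sandwich construction described above can be sketched outside the database. The following is a minimal NumPy illustration for the linear-regression case only (a CR0-style estimator with no small-sample correction; the function name and toy data are hypothetical, not MADlib's in-database implementation):

```python
import numpy as np

def clustered_ols_cov(X, y, clusters):
    """CR0-style clustered sandwich covariance for OLS coefficients."""
    beta = np.linalg.lstsq(X, y, rcond=None)[0]      # OLS fit
    resid = y - X @ beta
    A = X.T @ X                                      # "bread" matrix, summed over all rows
    B = np.zeros_like(A)                             # "meat": per-cluster score outer products
    for g in np.unique(clusters):
        m = clusters == g
        s_g = X[m].T @ resid[m]                      # score summed within cluster g
        B += np.outer(s_g, s_g)
    A_inv = np.linalg.inv(A)
    return beta, A_inv @ B @ A_inv                   # sandwich: A^{-1} B A^{-1}

# toy usage: intercept + one covariate, 8 clusters of 5 rows each
rng = np.random.default_rng(0)
X = np.column_stack([np.ones(40), rng.normal(size=40)])
y = X @ np.array([1.0, 2.0]) + rng.normal(size=40)
clusters = np.repeat(np.arange(8), 5)
beta, cov = clustered_ols_cov(X, y, clusters)
```

The loop over `np.unique(clusters)` plays the role of the in-database aggregate: each cluster contributes one summed score vector, and only the accumulated \f$B\f$ and \f$A\f$ are combined at the end.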
 
 When multinomial logistic regression is computed before the multinomial
-clustered variance calculation, it uses a default reference category of zero and the regression coefficients are included in the output table.  The
+clustered variance calculation, it uses a default reference category of zero
+and the regression coefficients are included in the output table. The
 regression coefficients in the output are in the same order as multinomial
 logistic regression function, which is described below.
 For a problem with
@@ -326,8 +442,8 @@
 /**
  * @brief Compute the clustered errors
  *
- * @param tbl_data Data table name
- * @param tbl_output The result table
+ * @param source_table Data table name
+ * @param out_table The result table
  * @param depvar Dependent variable expression
  * @param indvar Independent variable expression
  * @param clustervar The expressions used to clustering
@@ -335,8 +451,8 @@
  */
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
@@ -348,14 +464,14 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_linregr(tbl_data, tbl_output, depvar, indvar, clustervar, NULL);
+    PERFORM MADLIB_SCHEMA.clustered_variance_linregr(source_table, out_table, depvar, indvar, clustervar, NULL);
 END;
 $$ LANGUAGE plpgsql;
 
@@ -490,8 +606,8 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
@@ -507,14 +623,14 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
+    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(source_table, out_table, depvar, indvar, clustervar,
                                                     NULL, 20, 'irls', 0.0001, False);
 END;
 $$ LANGUAGE plpgsql;
@@ -522,15 +638,15 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
     grouping_col    TEXT
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
+    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(source_table, out_table, depvar, indvar, clustervar,
                                                     grouping_col, 20, 'irls', 0.0001, False);
 END;
 $$ LANGUAGE plpgsql;
@@ -538,8 +654,8 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
@@ -547,7 +663,7 @@
     max_iter        INTEGER
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
+    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(source_table, out_table, depvar, indvar, clustervar,
                                                     grouping_col, max_iter, 'irls', 0.0001, False);
 END;
 $$ LANGUAGE plpgsql;
@@ -555,8 +671,8 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
@@ -565,7 +681,7 @@
     optimizer       TEXT
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
+    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(source_table, out_table, depvar, indvar, clustervar,
                                                     grouping_col, max_iter, optimizer, 0.0001, False);
 END;
 $$ LANGUAGE plpgsql;
@@ -573,8 +689,8 @@
 ------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
-    tbl_data        TEXT,
-    tbl_output      TEXT,
+    source_table    TEXT,
+    out_table       TEXT,
     depvar          TEXT,
     indvar          TEXT,
     clustervar      TEXT,
@@ -584,7 +700,7 @@
     tolerance       DOUBLE PRECISION
 ) RETURNS VOID AS $$
 BEGIN
-    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
+    PERFORM MADLIB_SCHEMA.clustered_variance_logregr(source_table, out_table, depvar, indvar, clustervar,
                                                     grouping_col, max_iter, optimizer, tolerance, False);
 END;
 $$ LANGUAGE plpgsql;
diff --git a/src/ports/postgres/modules/regress/linear.py_in b/src/ports/postgres/modules/regress/linear.py_in
index 4348165..b1e504b 100644
--- a/src/ports/postgres/modules/regress/linear.py_in
+++ b/src/ports/postgres/modules/regress/linear.py_in
@@ -250,7 +250,7 @@
              'num_rows_processed'       INTEGER,            -- Number of rows that are actually used in each group
              'num_missing_rows_skipped' INTEGER             -- Number of rows that have NULL and are skipped in each group
 
-        A summary table is also created at the same time, which has:
+        A summary table named <out_table>_summary is also created at the same time, which has:
             'source_table'              VARCHAR,    -- the data source table name
             'out_table'                 VARCHAR,    -- the output table name
             'dependent_varname'         VARCHAR,    -- the dependent variable
diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in
index 1145cd4..6c22bcf 100644
--- a/src/ports/postgres/modules/regress/linear.sql_in
+++ b/src/ports/postgres/modules/regress/linear.sql_in
@@ -101,7 +101,32 @@
   <td>FLOAT8. The Breush-Pagan calculated p-value. Present only if
   the heteroskedacity parameter was set to True when the model was
   trained.</td>
+  </tr>
+  <tr>
+  <th>num_rows_processed</th>
+  <td>INTEGER. The number of rows that are actually used in each group.</td>
+  </tr>
+  <tr>
+  <th>num_missing_rows_skipped</th>
+  <td>INTEGER. The number of rows containing NULL values in any of the dependent or independent variables, which were skipped in the computation for each group.</td>
   </tr></table>
+
+  A summary table named <out_table>_summary is created together with the output table. It has the following columns:
+  <table class="output">
+  <tr>
+  <th>source_table</th>
+  <td>The data source table name.</td>
+  </tr>
+  <tr>
+  <th>out_table</th>
+  <td>The output table name.</td>
+  </tr>
+  <tr>
+  <th>dependent_varname</th>
+  <td>The dependent variable.</td>
+  </tr>
+  <tr>
+  <th>independent_varname</th>
+  <td>The independent variables.</td>
+  </tr>
+  <tr>
+  <th>num_rows_processed</th>
+  <td>The total number of rows that were used in the computation.</td>
+  </tr>
+  <tr>
+  <th>num_missing_rows_skipped</th>
+  <td>The total number of rows that were skipped because they contained NULL values.</td>
+    </tr></table>
 </dd>
 
 <dd>@note For p-values, we just return the computation result directly.
@@ -144,9 +169,7 @@
 @anchor predict
 @par Prediction Function
 <pre class="syntax">
-linregr_predict( coef,
-                 col_ind
-               )
+linregr_predict(coef, col_ind)
 </pre>
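The prediction is simply the inner product of the coefficient array with the independent-variable array. A minimal Python sketch of the equivalent computation (the standalone function and toy values here are illustrative, not MADlib's in-database implementation):

```python
def linregr_predict(coef, col_ind):
    """Inner product of fitted coefficients with one row's independent variables."""
    if len(coef) != len(col_ind):
        raise ValueError("coef and col_ind must have the same length")
    return sum(c * x for c, x in zip(coef, col_ind))

# e.g. an intercept of 1.5 plus two feature coefficients
pred = linregr_predict([1.5, 2.0, -0.5], [1.0, 3.0, 4.0])   # 1.5 + 6.0 - 2.0 = 5.5
```

Note that, as in the training examples, the intercept is handled by including a constant 1 as the first element of the independent-variable array.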
 \b Arguments
 <dl class="arglist">
diff --git a/src/ports/postgres/modules/regress/logistic.py_in b/src/ports/postgres/modules/regress/logistic.py_in
index 39b6ad6..4746aa7 100644
--- a/src/ports/postgres/modules/regress/logistic.py_in
+++ b/src/ports/postgres/modules/regress/logistic.py_in
@@ -425,7 +425,7 @@
             'num_missing_rows_skipped', integer,            -- number of rows that contain NULL and were skipped per group
             'num_iterations'            double precision    -- how many iterations are used in the computation per group
 
-        A summary table is also created at the same time, which has:
+        A summary table named <out_table>_summary is also created at the same time, which has:
             'source_table'              varchar,    -- the data source table name
             'out_table'                 varchar,    -- the output table name
             'dependent_varname'         varchar,    -- the dependent variable
diff --git a/src/ports/postgres/modules/regress/logistic.sql_in b/src/ports/postgres/modules/regress/logistic.sql_in
index 84bd70f..400cd67 100644
--- a/src/ports/postgres/modules/regress/logistic.sql_in
+++ b/src/ports/postgres/modules/regress/logistic.sql_in
@@ -123,6 +123,54 @@
     if the independent_varname is NULL or contains NULL values.</td>
       </tr>
     </table>
+
+    A summary table named <out_table>_summary is also created at the same time, which has the following columns:
+     <table class="output">
+    <tr>
+    <th>source_table</th>
+    <td>The data source table name.</td>
+    </tr>
+
+    <tr>
+    <th>out_table</th>
+    <td>The output table name.</td>
+    </tr>
+
+    <tr>
+    <th>dependent_varname</th>
+    <td>The dependent variable.</td>
+    </tr>
+
+    <tr>
+    <th>independent_varname</th>
+    <td>The independent variables</td>
+    </tr>
+
+    <tr>
+    <th>optimizer_params</th>
+    <td>A string that contains all the optimizer parameters, in the form 'optimizer=..., max_iter=..., tolerance=...'</td>
+    </tr>
+
+    <tr>
+    <th>num_all_groups</th>
+    <td>How many groups of data were fit by the logistic model.</td>
+    </tr>
+
+    <tr>
+    <th>num_failed_groups</th>
+    <td>How many groups' fitting processes failed.</td>
+    </tr>
+
+    <tr>
+    <th>num_rows_processed</th>
+    <td>The total number of rows used in the computation.</td>
+    </tr>
+
+    <tr>
+    <th>num_missing_rows_skipped</th>
+    <td>The total number of rows skipped.</td>
+    </tr>
+   </table>
   </DD>
 
   <DT>dependent_varname</DT>
diff --git a/src/ports/postgres/modules/regress/marginal.sql_in b/src/ports/postgres/modules/regress/marginal.sql_in
index 064fb8f..91b01af 100644
--- a/src/ports/postgres/modules/regress/marginal.sql_in
+++ b/src/ports/postgres/modules/regress/marginal.sql_in
@@ -28,7 +28,7 @@
 </ul>
 </div>
 
-@brief Calculates marginal effects for the coefficients in logistic and multinomial logistic regression problems. 
+@brief Calculates marginal effects for the coefficients in logistic and multinomial logistic regression problems.
 
 A marginal effect (ME) or partial effect measures the effect on the
 conditional mean of \f$ y \f$ of a change in one of the regressors, say
@@ -82,6 +82,9 @@
                 <td>DOUBLE PRECISION[]. An array of the Wald p-values of the marginal effects.</td>
             </tr>
         </table>
+
+        A summary table named <output_table>_summary is also created, which is the same as the summary table created by the logregr_train() function. Please refer to the documentation for logistic regression for details.
+
     </dd>
     <dt>dependent_variable</dt>
     <dd>VARCHAR. Name of column for dependent variables.</dd>
@@ -147,6 +150,9 @@
                 <td>DOUBLE PRECISION[]. An array of the Wald p-values of the marginal effects.</td>
             </tr>
         </table>
+
+        A summary table named <out_table>_summary is also created, which is the same as the summary table created by the mlogregr_train() function. Please refer to the documentation for multinomial logistic regression for details.
+
     </dd>
     <dt>dependent_varname</dt>
     <dd>VARCHAR. Name of column for dependent variables.</dd>
@@ -796,9 +802,6 @@
 */
 --------------------------------------------------------------------------------
 
--- DEPRECATED NOTICE -----------------------------------------------------------
--- The below functions has been deprecated and should be removed in next major
---    version update
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
      source_table               VARCHAR       -- name of input  table
    , out_table                  VARCHAR       -- name of output table
@@ -814,7 +817,6 @@
 PythonFunction(regress, marginal, margins_mlogregr_main)
 $$ LANGUAGE plpythonu;
 
-
 /**
   * @brief Marginal effects with default variable_names
  **/
@@ -832,44 +834,9 @@
   SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, $7, $8, FALSE);
 $$ LANGUAGE sql VOLATILE;
 
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
-     source_table               VARCHAR       -- name of input  table
-   , out_table                  VARCHAR       -- name of output table
-   , dependent_varname          VARCHAR       -- name of dependent variable
-   , independent_varname        VARCHAR       -- name of independent variable
-   , ref_category               INTEGER       -- reference category
-   , grouping_cols              VARCHAR       -- names of columns to group by
-   , marginal_vars              INTEGER[]     -- indices of variables to calculate marginal effects on
-   , max_iter                   INTEGER       -- Max iterations for the logstic regression inner call
-   , optimizer                  VARCHAR       -- Logistic regression optimizer
-   , tolerance                  DOUBLE PRECISION -- Tolerance
-   , verbose_mode               BOOLEAN       -- Verbose mode
-  )
-RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, $7,
-                                          'max_iter=' || cast($8 as text) ||
-                                          ', optimizer=' || $9 ||
-                                          ', tolerance=' || cast($10 as text),
-                                          $11)
-$$ LANGUAGE SQL;
--- END OF DEPRECATED NOTICE -----------------------------------------------------------
-
-/**
-  * @brief Marginal effects with default variable_names
- **/
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
-     source_table               VARCHAR       -- name of input  table
-   , out_table                  VARCHAR       -- name of output table
-   , dependent_varname          VARCHAR       -- name of dependent variable
-   , independent_varname        VARCHAR       -- name of independent variable
-   , ref_category               INTEGER       -- reference category
-   , grouping_cols              VARCHAR       -- names of columns to group by
-   , marginal_vars              INTEGER[]     -- indices of variables to calculate marginal effects on
-   , optimizer_params           VARCHAR       -- a comma-separated string with optimizer parameters
-  )
-RETURNS VOID AS $$
-  SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, $7, $8, FALSE);
-$$ LANGUAGE sql VOLATILE;
+-- DEPRECATED NOTICE -----------------------------------------------------------
+-- The functions below have been deprecated and should be removed in the next
+--    major version update
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
      source_table               VARCHAR       -- name of input  table
@@ -1016,7 +983,6 @@
 $$ LANGUAGE plpgsql VOLATILE;
 -- End of Default Variable calls for margins_mlogregr
 ------------------------------------------------------------------------------
-
 -- END OF DEPRECATED NOTICE -----------------------------------------------------------
 
 
diff --git a/src/ports/postgres/modules/regress/multilogistic.py_in b/src/ports/postgres/modules/regress/multilogistic.py_in
index e29e8a4..f6d4317 100644
--- a/src/ports/postgres/modules/regress/multilogistic.py_in
+++ b/src/ports/postgres/modules/regress/multilogistic.py_in
@@ -580,13 +580,15 @@
      std_err        -- DOUBLE PRECISION[], Standard errors
      z_stats        -- DOUBLE PRECISION[], z-stats of the standard errors
      p_values       -- DOUBLE PRECISION[], p-values of the standard errors
+     odds_ratio     -- DOUBLE PRECISION[], An array of the odds ratios, exp(coef)
+     condition_no   -- DOUBLE PRECISION, The condition number of the fitting.
      num_iterations -- INTEGER, Number of iterations performed by the optimizer
 
-The output summary table is named as <'output_table'>_summary has the following columns
+The output summary table named <'output_table'>_summary has the following columns
     source_table             -- VARCHAR, Source table name
     out_table                -- VARCHAR, Output table name
-    dep_var                  -- VARCHAR, Dependent variable name
-    ind_var                  -- VARCHAR, Independent variable name
+    dependent_varname        -- VARCHAR, Dependent variable name
+    independent_varname      -- VARCHAR, Independent variable name
     optimizer_params         -- VARCHAR, Optimizer parameters used
     ref_category             -- INTEGER, The value of reference category used
     num_rows_processed       -- INTEGER, Number of rows processed during training
diff --git a/src/ports/postgres/modules/regress/multilogistic.sql_in b/src/ports/postgres/modules/regress/multilogistic.sql_in
index 2f5f329..e7a3752 100644
--- a/src/ports/postgres/modules/regress/multilogistic.sql_in
+++ b/src/ports/postgres/modules/regress/multilogistic.sql_in
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------- *//**
-*
+ *
  * @file multilogistic.sql_in
  *
  * @brief SQL functions for multinomial logistic regression
@@ -103,6 +103,35 @@
             <td>INTEGER. The number of iterations executed before the
             algorithm completed.</td>
         </tr>
+    </table>
+
+    A summary table named <out_table>_summary is also created at the same time, and it contains the following columns:
+
+    <table class="output">
+        <tr>
+            <th>source_table</th>
+            <td>The data source table name.</td>
+        </tr>
+        <tr>
+            <th>out_table</th>
+            <td>The output table name.</td>
+        </tr>
+        <tr>
+            <th>dependent_varname</th>
+            <td>The dependent variable.</td>
+        </tr>
+        <tr>
+            <th>independent_varname</th>
+            <td>The independent variables.</td>
+        </tr>
+        <tr>
+            <th>optimizer_params</th>
+            <td>The optimizer parameters. It is a copy of the optimizer_params in the training function's arguments.</td>
+        </tr>
+        <tr>
+            <th>ref_category</th>
+            <td>INTEGER. The value of the reference category used.</td>
+        </tr>
         <tr>
             <th>num_rows_processed</th>
             <td>INTEGER. The number of rows actually processed, which is equal to the
diff --git a/src/ports/postgres/modules/regress/robust.sql_in b/src/ports/postgres/modules/regress/robust.sql_in
index d9e63ac..11eb33a 100644
--- a/src/ports/postgres/modules/regress/robust.sql_in
+++ b/src/ports/postgres/modules/regress/robust.sql_in
@@ -28,7 +28,7 @@
 </ul>
 </div>
 
-@brief Calculates Huber-White variance estimates for linear, logistic, and multinomial regression models, and for Cox proportional hazards models. 
+@brief Calculates Huber-White variance estimates for linear, logistic, and multinomial regression models, and for Cox proportional hazards models.
 
 The functions in this module calculate robust variance (Huber-White estimates)
 for linear regression, logistic regression, multinomial logistic regression,
@@ -61,17 +61,7 @@
 </pre>
 <dl class="arglist">
   <dt>source_table</dt>
-  <dd>VARCHAR. The name of the table containing the training data.
-    The training data is expected to be of the following form:
-    @verbatim
-    {TABLE|VIEW} sourceName (
-    outputTable              VARCHAR,
-    regressionType           VARCHAR,
-    dependentVariable        VARCHAR,
-    independentVariable      VARCHAR
-    )
-    @endverbatim
-  </dd>
+  <dd>VARCHAR. The name of the table containing the training data.</dd>
   <dt>out_table</dt>
   <dd>VARCHAR. Name of the generated table containing the output model. The output table contains the following columns.
     <table class="output">
@@ -92,6 +82,8 @@
         <td>DOUBLE PRECISION[]. Vector of the p-values of the coefficients.</td>
       </tr>
     </table>
+
+    A summary table named <out_table>_summary is also created, which is the same as the summary table created by the linregr_train() function. Please refer to the documentation for linear regression for details.
   </dd>
   <dt>dependent_varname</dt>
   <dd>VARCHAR. The name of the column containing the dependent variable.</dd>
@@ -147,6 +139,8 @@
       <td>Vector of the p-values of the coefficients.</td>
     </tr>
   </table>
+
+  A summary table named <out_table>_summary is also created, which is the same as the summary table created by the logregr_train() function. Please refer to the documentation for logistic regression for details.
 </dd>
 <dt>dependent_varname</dt>
 <dd>VARCHAR. The name of the column containing the independent variable.</dd>
@@ -195,32 +189,34 @@
 <dt>out_table</dt>
 <dd>VARCHAR. The name of the table where the regression model will be stored.
 The output table has the following columns:
-<table class="output">
-  <tr>
-    <th>category</th>
-    <td>The category.</td>
-  </tr>
-  <tr>
-    <th>ref_category</th>
-    <td>The refererence category used for modeling.</td>
-  </tr>
-  <tr>
-    <th>coef</th>
-    <td>Vector of the coefficients of the regression.</td>
-  </tr>
-  <tr>
-    <th>std_err</th>
-    <td>Vector of the standard error of the coefficients.</td>
-  </tr>
-  <tr>
-    <th>z_stats</th>
-    <td>Vector of the z-stats of the coefficients.</td>
-  </tr>
-  <tr>
-    <th>p_values</th>
-    <td>Vector of the p-values of the coefficients.</td>
-  </tr>
-</table>
+    <table class="output">
+      <tr>
+        <th>category</th>
+        <td>The category.</td>
+      </tr>
+      <tr>
+        <th>ref_category</th>
+        <td>The reference category used for modeling.</td>
+      </tr>
+      <tr>
+        <th>coef</th>
+        <td>Vector of the coefficients of the regression.</td>
+      </tr>
+      <tr>
+        <th>std_err</th>
+        <td>Vector of the standard error of the coefficients.</td>
+      </tr>
+      <tr>
+        <th>z_stats</th>
+        <td>Vector of the z-stats of the coefficients.</td>
+      </tr>
+      <tr>
+        <th>p_values</th>
+        <td>Vector of the p-values of the coefficients.</td>
+      </tr>
+    </table>
+
+    A summary table named <out_table>_summary is also created, which is the same as the summary table created by the mlogregr_train() function. Please refer to the documentation for multinomial logistic regression for details.
 </dd>
 <dt>dependent_varname</dt>
 <dd>VARCHAR. The name of the column containing the dependent variable.</dd>
@@ -257,8 +253,7 @@
     <dt>model_table</dt>
     <dd>TEXT. The name of the model table, which is exactaly the same as the 'output_table' parameter of coxph_train() function.</dd>
     <dt>output_table</dt>
-    <dd>TEXT. The name of the table where the output is saved.
-        The output is saved in the table named by the <em>output_table</em> argument. It has the following columns:
+    <dd>TEXT. The name of the table where the output is saved. It has the following columns:
         <table class="output">
             <tr>
                 <th>coef</th>
diff --git a/src/ports/postgres/modules/stats/clustered_variance_coxph.py_in b/src/ports/postgres/modules/stats/clustered_variance_coxph.py_in
index 7187666..78c4588 100644
--- a/src/ports/postgres/modules/stats/clustered_variance_coxph.py_in
+++ b/src/ports/postgres/modules/stats/clustered_variance_coxph.py_in
@@ -138,7 +138,7 @@
         create table {output_table} as
             select
                 u.coef, u.loglikelihood, u.std_err,
-                '{clustervar}' as clustervar,
+                '{clustervar}'::TEXT as clustervar,
                 (v.f).std_err as clustered_se,
                 (v.f).z_stats as clustered_z,
                 (v.f).p_values as clustered_p, 
@@ -326,4 +326,4 @@
 
         For more details on function usage:
             SELECT madlib.clustered_variance_coxph('usage');
-        """
\ No newline at end of file
+        """
diff --git a/src/ports/postgres/modules/stats/cox_prop_hazards.py_in b/src/ports/postgres/modules/stats/cox_prop_hazards.py_in
index 65627f4..4a26c4a 100644
--- a/src/ports/postgres/modules/stats/cox_prop_hazards.py_in
+++ b/src/ports/postgres/modules/stats/cox_prop_hazards.py_in
@@ -40,108 +40,115 @@
     """
     if not message:
         help_string = """
-        -----------------------------------------------------------------------
-                                    SUMMARY
-        -----------------------------------------------------------------------
-        Functionality: Cox proprtional hazards regression (Breslow method)
-        Proportional-Hazard models enable the comparison of various survival models.
-        These survival models are functions describing the probability of a one-item
-        event (prototypically, this event is death) with respect to time.
-        The interval of time before death occurs is the survival time.
-        Let T be a random variable representing the survival time,
-        with a cumulative probability function P(t). Informally, P(t) is
-        the probability that death has happened before time t.
+-----------------------------------------------------------------------
+                            SUMMARY
+-----------------------------------------------------------------------
+Functionality: Cox proportional hazards regression (Breslow method)
+Proportional-Hazard models enable the comparison of various survival models.
+These survival models are functions describing the probability of a one-item
+event (prototypically, this event is death) with respect to time.
+The interval of time before death occurs is the survival time.
+Let T be a random variable representing the survival time,
+with a cumulative probability function P(t). Informally, P(t) is
+the probability that death has happened before time t.
 
-        For more details on function usage:
-            SELECT {schema_madlib}.coxph_train('usage')
-        For an example on using the function:
-            SELECT {schema_madlib}.coxph_train('example')
-        """
+For more details on function usage:
+    SELECT {schema_madlib}.coxph_train('usage')
+For an example on using the function:
+    SELECT {schema_madlib}.coxph_train('example')
+"""
+
     elif message in ['usage', 'help', '?']:
         help_string = """
-        -----------------------------------------------------------------------
-                                        USAGE
-        -----------------------------------------------------------------------
-         SELECT {schema_madlib}.coxph_train(
-         'source_table',            -- Name of data table
-         'output_table',               -- Name of result table (overwrites if exists)
-         'dependent_variable',      -- Name of column for dependent variables
-         'independent_variable',    -- Name of column for independent variables
-                                          (can be any SQL expression Eg: '*')
-         'right_censoring_status',  -- Name of the column containing censoring status
-                                            0/false : If the observation is censored
-                                            1/true : otherwise
-                                        Can also be an SQL expression: 'dependent_variable < 10')
-                                       (Optional, DEFAULT = TRUE)
-         'strata',                  -- The stratification column names. (Optional, DEFAULT = NULL)
-         'optimizer_params'         -- The optimizer parameters as a comma-separated string
-         );
+-----------------------------------------------------------------------
+                                USAGE
+-----------------------------------------------------------------------
+ SELECT {schema_madlib}.coxph_train(
+ 'source_table',            -- Name of data table
+ 'output_table',            -- Name of result table (overwrites if exists)
+ 'dependent_variable',      -- Name of column for dependent variables
+ 'independent_variable',    -- Name of column for independent variables
+                               (can be any SQL expression, e.g. '*')
+ 'right_censoring_status',  -- Name of the column containing censoring status:
+                                 0/false : if the observation is censored
+                                 1/true  : otherwise
+                               (Can also be an SQL expression: 'dependent_variable < 10')
+                               (Optional, DEFAULT = TRUE)
+ 'strata',                  -- The stratification column names. (Optional, DEFAULT = NULL)
+ 'optimizer_params'         -- The optimizer parameters as a comma-separated string
+ );
 
-        -----------------------------------------------------------------------
-                                        OUTUPT
-        -----------------------------------------------------------------------
-        The output table ('output_table' above) has the following columns
-             'coef'          DOUBLE PRECISION[], -- Coefficients of regression
-             'loglikelihood' DOUBLE PRECISION,   -- Log-likelihood value
-             'std_err'       DOUBLE PRECISION[], -- Standard errors
-             'z_stats'       DOUBLE PRECISION[], -- z-stats of the standard errors
-             'p_values'      DOUBLE PRECISION[], -- p-values of the standard errors
-             'num_iterations'      INTEGER       -- Number of iterations performed by the optimizer
+-----------------------------------------------------------------------
+                                OUTPUT
+-----------------------------------------------------------------------
+The output table ('output_table' above) has the following columns
+     'coef'          DOUBLE PRECISION[], -- Coefficients of regression
+     'loglikelihood' DOUBLE PRECISION,   -- Log-likelihood value
+     'std_err'       DOUBLE PRECISION[], -- Standard errors
+     'z_stats'       DOUBLE PRECISION[], -- z-stats of the standard errors
+     'p_values'      DOUBLE PRECISION[], -- p-values of the standard errors
+     'num_iterations'      INTEGER       -- Number of iterations performed by the optimizer
 
-        The output summary table is named as <output_table>_summary has the following columns
-            'source_table'   VARCHAR,            -- source table name
-            'dep_var'        VARCHAR,            -- dependent variable name
-            'ind_var'        VARCHAR,            -- independent variable name
-            'right_censoring_status' VARCHAR,    -- right censoring status
-            'strata'         VARCHAR             -- stratification columns
+The output summary table, named <output_table>_summary, has the following columns
+    'source_table'              VARCHAR, Source table name
+    'dep_var'                   VARCHAR, Dependent variable name
+    'ind_var'                   VARCHAR, Independent variable name
+    'right_censoring_status'    VARCHAR, Right censoring status
+    'strata'                    VARCHAR, Stratification columns
+    'num_rows_processed'        INTEGER, Number of rows processed during training
+    'num_missing_rows_skipped'  INTEGER, Number of rows skipped during training
+                                         due to missing values
+
         """
+
     elif message in ['example', 'examples']:
         help_string = """
-        DROP TABLE IF EXISTS sample_data;
-        CREATE TABLE sample_data (
-            id INTEGER NOT NULL,
-            grp DOUBLE PRECISION,
-            wbc DOUBLE PRECISION,
-            timedeath INTEGER,
-            status BOOLEAN
-        );
+DROP TABLE IF EXISTS sample_data;
+CREATE TABLE sample_data (
+    id INTEGER NOT NULL,
+    grp DOUBLE PRECISION,
+    wbc DOUBLE PRECISION,
+    timedeath INTEGER,
+    status BOOLEAN
+);
 
-        COPY sample_data FROM STDIN DELIMITER '|';
-          0 |   0 | 1.45 |        35 | t
-          1 |   0 | 1.47 |        34 | t
-          3 |   0 |  2.2 |        32 | t
-          4 |   0 | 1.78 |        25 | t
-          5 |   0 | 2.57 |        23 | t
-          6 |   0 | 2.32 |        22 | t
-          7 |   0 | 2.01 |        20 | t
-          8 |   0 | 2.05 |        19 | t
-          9 |   0 | 2.16 |        17 | t
-         10 |   0 |  3.6 |        16 | t
-         11 |   1 |  2.3 |        15 | t
-         12 |   0 | 2.88 |        13 | t
-         13 |   1 |  1.5 |        12 | t
-         14 |   0 |  2.6 |        11 | t
-         15 |   0 |  2.7 |        10 | t
-         16 |   0 |  2.8 |         9 | t
-         17 |   1 | 2.32 |         8 | t
-         18 |   0 | 4.43 |         7 | t
-         19 |   0 | 2.31 |         6 | t
-         20 |   1 | 3.49 |         5 | t
-         21 |   1 | 2.42 |         4 | t
-         22 |   1 | 4.01 |         3 | t
-         23 |   1 | 4.91 |         2 | t
-         24 |   1 |    5 |         1 | t
-        \.
+COPY sample_data FROM STDIN DELIMITER '|';
+  0 |   0 | 1.45 |        35 | t
+  1 |   0 | 1.47 |        34 | t
+  3 |   0 |  2.2 |        32 | t
+  4 |   0 | 1.78 |        25 | t
+  5 |   0 | 2.57 |        23 | t
+  6 |   0 | 2.32 |        22 | t
+  7 |   0 | 2.01 |        20 | t
+  8 |   0 | 2.05 |        19 | t
+  9 |   0 | 2.16 |        17 | t
+ 10 |   0 |  3.6 |        16 | t
+ 11 |   1 |  2.3 |        15 | t
+ 12 |   0 | 2.88 |        13 | t
+ 13 |   1 |  1.5 |        12 | t
+ 14 |   0 |  2.6 |        11 | t
+ 15 |   0 |  2.7 |        10 | t
+ 16 |   0 |  2.8 |         9 | t
+ 17 |   1 | 2.32 |         8 | t
+ 18 |   0 | 4.43 |         7 | t
+ 19 |   0 | 2.31 |         6 | t
+ 20 |   1 | 3.49 |         5 | t
+ 21 |   1 | 2.42 |         4 | t
+ 22 |   1 | 4.01 |         3 | t
+ 23 |   1 | 4.91 |         2 | t
+ 24 |   1 |    5 |         1 | t
+\.
 
-        SELECT {schema_madlib}.coxph_train(
-            'sample_data',
-            'sample_cox',
-            'timedeath',
-            'ARRAY[grp,wbc]',
-            'status');
+SELECT {schema_madlib}.coxph_train(
+    'sample_data',
+    'sample_cox',
+    'timedeath',
+    'ARRAY[grp,wbc]',
+    'status');
 
-        SELECT * FROM sample_cox;
+SELECT * FROM sample_cox;
         """
+
     else:
         help_string = "No such option. Use {schema_madlib}.coxph_train()"
 
@@ -546,18 +553,17 @@
             iteration=iteration,
             sourceAlias="src",
             madlib_iterative_alg=madlib_iterative_alg))
-        if (plpy.execute(checkForNullStateSQL.format(
-                                iteration=iteration,
-                                madlib_iterative_alg=madlib_iterative_alg)
+        if (plpy.execute(checkForNullStateSQL.format(iteration=iteration,
+                                                     madlib_iterative_alg=madlib_iterative_alg)
                          )[0]['should_terminate'] or
                 (iteration > cyclesPerIteration and
                     (iteration >= cyclesPerIteration * maxNumIterations or
                         plpy.execute(terminateSQL.format(
-                                iteration=iteration,
-                                cyclesPerIteration=cyclesPerIteration,
-                                oldState="(older._madlib_state)",
-                                newState="(newer._madlib_state)")
-                                )[0]['should_terminate']))):
+                            iteration=iteration,
+                            cyclesPerIteration=cyclesPerIteration,
+                            oldState="(older._madlib_state)",
+                            newState="(newer._madlib_state)")
+                            )[0]['should_terminate']))):
             break
 
     # Note: We do not drop the temporary table
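The condition reindented in the hunk above exits the optimizer loop when the aggregate state comes back NULL, or when at least one full cycle has run and either the iteration budget is exhausted or the convergence test fires. A hypothetical, much-simplified Python rendering of that predicate (function and parameter names invented here for illustration; the real code evaluates these checks via SQL):

```python
def should_stop(state_is_null, iteration, cycles_per_iteration,
                max_num_iterations, converged):
    """Simplified mirror of the loop-exit test in the iteration controller.

    Stop if the state is NULL, or if more than one cycle has completed and
    we have either hit the iteration cap or satisfied the convergence test.
    """
    return (state_is_null or
            (iteration > cycles_per_iteration and
             (iteration >= cycles_per_iteration * max_num_iterations or
              converged)))

# Budget exhausted: 3 cycles/iteration * 5 max iterations = 15.
print(should_stop(False, 15, 3, 5, False))  # True
print(should_stop(False, 4, 3, 5, False))   # False: within budget, not converged
```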
@@ -760,13 +766,14 @@
                                    OUTUPT
         -----------------------------------------------------------------------
         The <output table> ('output_table' above) has the following columns
+                - covariate     TEXT. The names of independent variables
                 - rho           FLOAT8[]. Vector of the correlation coefficients between
                                       survival time and the scaled Schoenfeld residuals
                 - chi_square    FLOAT8[]. Chi-square test statistic for the correlation analysis
                 - p_value       FLOAT8[]. Two-side p-value for the chi-square statistic
 
         The output residual table is named as <output_table>_residual has the following columns
-                - time              FLOAT8. Time values present in the original source table.
+                - <dep_column_name> FLOAT8. Time values (dependent variable) present in the original source table.
                 - residual          FLOAT8[]. Difference between the original covariate value and the
                                           expectation of the covariate obtained from the coxph model.
                 - scaled_reisdual   FLOAT8[]. Residual values scaled by the variance of the coefficients
@@ -926,8 +933,8 @@
     else:
         partition_str = ''
 
-    coef = madvec(plpy.execute("SELECT coef FROM {table} ".format(
-                                    table=cox_output_table))[0]["coef"],
+    coef = madvec(plpy.execute("SELECT coef FROM {table} ".
+                               format(table=cox_output_table))[0]["coef"],
                   text=False)
     coef_str = "ARRAY" + str(coef)
     # We don't extract a copy of the Hessian 2D array, since Postgres/GPDB still
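The hunk above pulls the fitted `coef` vector out of the Cox output table and splices it into later SQL as a literal via `"ARRAY" + str(coef)`. A standalone sketch of that list-to-SQL-literal step (the helper name is invented here; it works because Python's list repr, e.g. `[0.5, -1.25]`, happens to match PostgreSQL's array-constructor syntax for numeric elements):

```python
def coef_to_sql_array(coef):
    """Render a Python sequence of coefficients as a PostgreSQL ARRAY literal.

    Mirrors the "ARRAY" + str(coef) idiom from the hunk above; only safe for
    plain numeric values, not for strings needing quoting/escaping.
    """
    return "ARRAY" + str(list(coef))

print(coef_to_sql_array([0.5, -1.25]))  # ARRAY[0.5, -1.25]
```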
diff --git a/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in b/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
index 918258b..6b54531 100644
--- a/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
+++ b/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
@@ -90,7 +90,7 @@
     </dd>
     <dd> Additionally, a summary output table is generated that contains
     a summary of the parameters used for building the Cox model. It is stored
-    in a table named <em>output_table</em>_summary. It has the following columns:
+    in a table named <output_table>_summary. It has the following columns:
     <table class="output">
         <tr>
             <th>source_table</th>
@@ -112,6 +112,14 @@
             <th>strata</th>
             <td>The stratification columns</td>
         </tr>
+        <tr>
+            <th>num_rows_processed</th>
+            <td>The number of rows that were actually used in the computation.</td>
+        </tr>
+        <tr>
+            <th>num_missing_rows_skipped</th>
+            <td>The number of rows that were skipped in the computation due to NULL values in them.</td>
+        </tr>
     </table>
     </dd>
 
@@ -132,7 +140,7 @@
     <dt>strata (optional)</dt>
     <dd>VARCHAR, default: NULL, which does not do any stratifications. A string of comma-separated column names that are the strata ID variables used to do stratification.</dd>
     <dt>optimizer_params (optional)</dt>
-    <dd>VARCHAR, default: NULL, which uses the default values of optimizer parameters: max_iter=20, optimizer='newton', tolerance=1e-4. It should be a string that contains pairs of 'key=value' separated by commas.</dd>
+    <dd>VARCHAR, default: NULL, which uses the default values of optimizer parameters: max_iter=20, optimizer=newton, tolerance=1e-4. It should be a string that contains pairs of 'key=value' separated by commas.</dd>
 </dl>
 
 @anchor cox_zph
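The `optimizer_params` argument described above is a comma-separated string of 'key=value' pairs, e.g. 'max_iter=20, optimizer=newton, tolerance=1e-4', with NULL falling back to the defaults. A minimal sketch of how such a string could be parsed (this is not the actual MADlib parser, which also validates types and key names):

```python
def parse_optimizer_params(params, defaults=None):
    """Parse "key=value, key=value" into a dict, applying defaults first.

    A NULL/empty input keeps the defaults, matching the documented behavior.
    """
    result = dict(defaults or {})
    if params:
        for pair in params.split(','):
            key, _, value = pair.partition('=')
            result[key.strip()] = value.strip()
    return result

defaults = {'max_iter': '20', 'optimizer': 'newton', 'tolerance': '1e-4'}
# Overrides max_iter and tolerance; optimizer keeps its default.
print(parse_optimizer_params('max_iter=50, tolerance=1e-6', defaults))
```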
@@ -151,9 +159,7 @@
 
 Following is the syntax for the cox_zph() function:
 <pre class="syntax">
-cox_zph( cox_model_table,
-         output_table
-       )
+cox_zph(cox_model_table, output_table)
 </pre>
 \b Arguments
 <dl class="arglist">
@@ -166,6 +172,10 @@
         the following columns:
         <table class="output">
             <tr>
+                <th>covariate</th>
+                <td>TEXT. The independent variables.</td>
+            </tr>
+            <tr>
                 <th>rho</th>
                 <td>FLOAT8[]. Vector of the correlation coefficients between
                 survival time and the scaled Schoenfeld residuals.</td>
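The `rho` column documented in the hunk above holds, per covariate, the correlation between survival time and the scaled Schoenfeld residuals; the chi-square column then tests whether that correlation is zero. As a rough illustration of the correlation part only (a plain Pearson coefficient in Python; the actual cox_zph statistic involves additional variance machinery):

```python
import math

def pearson_rho(xs, ys):
    """Pearson correlation coefficient between two equal-length sequences."""
    n = len(xs)
    mx = sum(xs) / n
    my = sum(ys) / n
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sx = math.sqrt(sum((x - mx) ** 2 for x in xs))
    sy = math.sqrt(sum((y - my) ** 2 for y in ys))
    return cov / (sx * sy)

# A perfectly linear relationship gives rho = 1; a strong trend of residuals
# against time is evidence against the proportional-hazards assumption.
print(pearson_rho([1, 2, 3, 4], [2, 4, 6, 8]))  # 1.0
```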
@@ -186,8 +196,8 @@
 The table contains the following columns:
         <table class="output">
             <tr>
-                <th>time</th>
-                <td>FLOAT8. Time values present in the original source table. </td>
+                <th><dep_column_name></th>
+                <td>FLOAT8. Time values (dependent variable) present in the original source table. </td>
             </tr>
             <tr>
                 <th>residual</th>