# ------------------------------------------------------------------------
# Compute clustered errors
# ------------------------------------------------------------------------
import plpy

from utilities.utilities import _assert
from utilities.utilities import _string_to_array_with_quotes
from utilities.utilities import add_postfix
from utilities.utilities import extract_keyvalue_params
from utilities.utilities import rename_table
from utilities.utilities import unique_string
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_exists
from utilities.validate_args import table_is_empty
# ========================================================================
def __prepare_strings(clustervar, grouping_col):
"""
Prepare the needed strings
"""
    fitres = unique_string()  # result table for the underlying regression fit
if grouping_col is None:
grouping_col_str = "NULL"
else:
grouping_col_str = "'" + grouping_col + "'"
coef_str = unique_string()
if clustervar is None:
cluster_grouping_str = "group by {coef_str}".format(coef_str=coef_str)
else:
cluster_grouping_str = "group by {coef_str}, ".format(coef_str=coef_str) + clustervar
return (fitres, coef_str, cluster_grouping_str, grouping_col_str)
# ------------------------------------------------------------------------
def __generate_clustered_sql(**kwargs):
"""
Create the SQL query to execute and create the result table
"""
sqlPart1 = """
create table {out_table} as
select (f).* from (
select {schema_madlib}.__clustered_{regr_type}_compute_stats(
max(coef),
m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(meatvec),
m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(breadvec),
count(numRows)::integer, (sum(numRows))::integer) as f
from (
select (g).*, coef, numRows from (
select """.format(**kwargs)
if kwargs['regr_type'] == 'mlog':
        # For the mlog regression we also need the number of categories
        # and the reference category
sqlPart2 = """{schema_madlib}.__clustered_err_{regr_type}_step(
({depvar})::INTEGER,
({indvar})::DOUBLE PRECISION[],
({coef_str})::DOUBLE PRECISION[],
(select count(distinct {depvar})
from {source_table})::INTEGER,
({ref_category})::INTEGER) as g,""".format(**kwargs)
elif kwargs['regr_type'] == 'log':
sqlPart2 = """{schema_madlib}.__clustered_err_{regr_type}_step(
({depvar})::BOOLEAN,
({indvar})::DOUBLE PRECISION[],
({coef_str})::DOUBLE PRECISION[]) as g,""".format(**kwargs)
else:
sqlPart2 = """{schema_madlib}.__clustered_err_{regr_type}_step(
{depvar},
({indvar})::DOUBLE PRECISION[],
({coef_str})::DOUBLE PRECISION[]) as g,""".format(**kwargs)
sqlPart3 = """{coef_str} as coef,
count({depvar}) as numRows
FROM (
SELECT u.coef as {coef_str}, v.*
FROM
{fitres} u, {source_table} v
where
({indvar}) is not NULL and
{schema_madlib}.array_contains_null({indvar}) is False and
({depvar}) is not NULL
) s
{cluster_grouping_str}) t) p) q
""".format(**kwargs)
return sqlPart1 + sqlPart2 + sqlPart3
# ========================================================================
def clustered_variance_linregr(schema_madlib, source_table, out_table,
depvar, indvar, clustervar, grouping_col,
**kwargs):
"""
Linear regression clustered standard errors
"""
old_msg_level = plpy.execute("select setting from pg_settings where \
name='client_min_messages'")[0]['setting']
plpy.execute("set client_min_messages to error")
validate_args_clustered_variance_linregr(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col)
(fitres, coef_str, cluster_grouping_str,
grouping_col_str) = __prepare_strings(clustervar, grouping_col)
plpy.execute(
"""
select {schema_madlib}.linregr_train('{source_table}',
'{fitres}', '{depvar}', '{indvar}', {grouping_col})
""".format(schema_madlib=schema_madlib, source_table=source_table,
fitres=fitres, depvar=depvar, indvar=indvar,
grouping_col=grouping_col_str))
out_table_summary = add_postfix(out_table, "_summary")
fitres_summary = add_postfix(fitres, "_summary")
# Rename the output summary table
rename_table(schema_madlib, fitres_summary, out_table_summary)
plpy.execute("UPDATE {0} SET out_table = '{1}'".
format(out_table_summary, out_table))
plpy.execute(
__generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
indvar=indvar, coef_str=coef_str,
source_table=source_table, fitres=fitres,
cluster_grouping_str=cluster_grouping_str,
out_table=out_table, regr_type="lin"))
plpy.execute("drop table if exists {fitres}".format(fitres=fitres))
plpy.execute("set client_min_messages to " + old_msg_level)
return None
# ========================================================================
def validate_args_clustered_variance(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col):
"""
Validate the parameters
"""
if not source_table or source_table.lower() in ('null', '') or not table_exists(source_table):
plpy.error("Clustered variance estimation error: Data table does not exist!")
if table_is_empty(source_table):
plpy.error("Clustered variance estimation error: Data table is empty!")
if out_table is None or out_table.lower() in ('null', ''):
plpy.error("Clustered variance estimation error: Invalid output table name!")
if table_exists(out_table, only_first_schema=True):
plpy.error("Clustered variance estimation error: Output table exists!")
if table_exists(add_postfix(out_table, "_summary"), only_first_schema=True):
plpy.error("Clustered variance estimation error: Output summary table exists!")
if depvar is None or (not isinstance(depvar, str)) or depvar.lower() in ('null', ''):
plpy.error("Clustered variance estimation error: Invalid dependent column name!")
if indvar is None or (not isinstance(indvar, str)) or indvar.lower() in ('null', ''):
plpy.error("Clustered variance estimation error: Invalid independent column name!")
# try:
# plpy.execute("select {indvar}".format(indvar = indvar))
# isconst = True
# except:
# isconst = False
# if isconst:
# plpy.error("Clustered variance estimation error: independent variable is a constant!")
# try:
# plpy.execute("select {depvar}".format(depvar = depvar))
# notconst = False
# except:
# notconst = True
# if notconst is False:
# plpy.error("Clustered variance estimation error: dependent variable is a constant!")
# try:
# plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = source_table))
# success = True
# except:
# success = False
# if success is False:
# plpy.error("Clustered variance estimation error: independent variable does not exist in the data table!")
# try:
# plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = source_table))
# success = True
# except:
# success = False
# if not success:
# plpy.error("Clustered variance estimation error: dependent variable does not exist in the data table!")
# if not scalar_col_has_no_null(source_table, depvar):
# plpy.error("Clustered variance estimation error: Dependent variable has Null values! \
# Please filter out Null values before using this function!")
    if clustervar is None or clustervar.lower() in ('null', ''):
        # clustervar is required and must name one or more existing columns
        plpy.error("Clustered variance estimation error: Invalid cluster column names!")
if clustervar is not None:
if not columns_exist_in_table(source_table,
_string_to_array_with_quotes(clustervar),
schema_madlib):
plpy.error("Clustered variance estimation error: Cluster column does not exist!")
    if grouping_col is not None and grouping_col.lower() in ('null', ''):
        # grouping_col is optional, but if provided it must name existing columns
        plpy.error("Clustered variance estimation error: Invalid grouping column names!")
if grouping_col:
if not columns_exist_in_table(source_table,
_string_to_array_with_quotes(grouping_col),
schema_madlib):
plpy.error("Clustered variance estimation error: Grouping column does not exist!")
# ========================================================================
def validate_args_clustered_variance_linregr(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col):
"""
Validate the parameters
"""
validate_args_clustered_variance(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col)
# ========================================================================
def clustered_variance_linregr_help(schema_madlib, msg=None, **kwargs):
"""
Print help messages
"""
    if msg is None or msg.strip(' ') in ('help', '?'):
return """
----------------------------------------------------------------
Summary
----------------------------------------------------------------
Computes the clustered standard errors for linear regression.
The function first runs a linear regression to get the fitting
coefficients. Then it computes the clustered standard errors for
the linear regression.
SELECT {schema_madlib}.clustered_variance_linregr(
'source_table',
'out_table',
'depvar',
'indvar',
'clustervar',
'grouping_col'
);
--
Run:
SELECT {schema_madlib}.clustered_variance_linregr('usage');
to get more information.
""".format(schema_madlib=schema_madlib)
if msg.strip(' ') == 'usage':
return """
Usage:
----------------------------------------------------------------
SELECT {schema_madlib}.clustered_variance_linregr(
'source_table', -- Name of data table
'out_table', -- Name of result table (raise an error if it already exists)
'depvar', -- Expression for dependent variable
'indvar', -- Expression for independent variables
'clustervar', -- Column names for cluster variables, separated by comma
'grouping_col' -- Grouping regression column names, separated by comma, default NULL
);
Output:
----------------------------------------------------------------
The output table has the following columns:
coef DOUBLE PRECISION[], -- Fitting coefficients
std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
t_stats DOUBLE PRECISION[], -- t-stats of the errors
p_values DOUBLE PRECISION[] -- p-values of the errors
The output summary table is the same as linregr_train(), see also:
SELECT linregr_train('usage');
""".format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for logistic regression
# ========================================================================
def clustered_variance_logregr(schema_madlib, source_table, out_table,
depvar, indvar, clustervar, grouping_col,
max_iter, optimizer, tolerance, verbose_mode,
**kwargs):
"""
Logistic regression clustered standard errors
"""
if optimizer is not None and optimizer.lower() == 'newton':
optimizer = 'irls'
validate_args_clustered_variance_logregr(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col,
max_iter, optimizer, tolerance,
verbose_mode)
(fitres, coef_str, cluster_grouping_str,
grouping_col_str) = __prepare_strings(clustervar, grouping_col)
plpy.execute(
"""
SELECT {schema_madlib}.logregr_train(
'{source_table}', '{fitres}', '{depvar}', '{indvar}',
{grouping_col}, {max_iter}, '{optimizer}', {tolerance},
{verbose})
""".format(schema_madlib=schema_madlib, source_table=source_table,
fitres=fitres, depvar=depvar, indvar=indvar,
grouping_col=grouping_col_str, max_iter=max_iter,
optimizer=optimizer, tolerance=tolerance,
verbose=verbose_mode))
old_msg_level = plpy.execute("select setting from pg_settings where \
name='client_min_messages'")[0]['setting']
if verbose_mode:
plpy.execute("set client_min_messages to warning")
else:
plpy.execute("set client_min_messages to error")
out_table_summary = add_postfix(out_table, "_summary")
fitres_summary = add_postfix(fitres, "_summary")
# Rename the output summary table
rename_table(schema_madlib, fitres_summary, out_table_summary)
plpy.execute("""UPDATE {0} SET out_table = '{1}'
""".format(out_table_summary, out_table))
plpy.execute(
__generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
indvar=indvar, coef_str=coef_str,
source_table=source_table, fitres=fitres,
cluster_grouping_str=cluster_grouping_str,
out_table=out_table, regr_type="log"))
plpy.execute("DROP TABLE IF EXISTS {fitres}".format(fitres=fitres))
plpy.execute("SET client_min_messages TO " + old_msg_level)
return None
# ========================================================================
def validate_args_clustered_variance_logregr(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col,
max_iter, optimizer, tolerance,
verbose_mode):
"""
Validate the parameters
"""
validate_args_clustered_variance(schema_madlib, source_table,
out_table, depvar, indvar,
clustervar, grouping_col)
if max_iter is None or max_iter <= 0:
plpy.error("Clustered variance estimation error: Maximum number of "
"iterations must be a positive number!")
if tolerance is None or tolerance < 0:
plpy.error("Clustered variance estimation error: The tolerance must be "
"a non-negative number!")
if not isinstance(verbose_mode, bool):
plpy.error("Clustered variance estimation error: verbose_mode option must "
"be a boolean!")
if optimizer not in ("irls", "cg", "igd"):
plpy.error(""" Clustered variance estimation error: Unknown optimizer requested.
Must be 'newton'/'irls', 'cg', or 'igd'.
""")
# ========================================================================
def clustered_variance_logregr_help(schema_madlib, msg=None, **kwargs):
"""
Print help messages
"""
    if msg is None or msg.strip(' ') in ('help', '?'):
return """
----------------------------------------------------------------
Summary
----------------------------------------------------------------
Computes the clustered standard errors for logistic regression.
The function first runs a logistic regression to get the fitting
coefficients. Then it computes the clustered standard errors for
the logistic regression.
SELECT {schema_madlib}.clustered_variance_logregr(
'source_table',
'out_table',
'depvar',
'indvar',
'clustervar',
'grouping_col',
max_iter,
'optimizer',
tolerance,
verbose_mode
);
--
Run:
SELECT {schema_madlib}.clustered_variance_logregr('usage');
to get more information.
""".format(schema_madlib=schema_madlib)
if msg.strip(' ') == 'usage':
return """
Usage:
----------------------------------------------------------------
SELECT {schema_madlib}.clustered_variance_logregr(
'source_table', -- Name of data table
'out_table', -- Name of result table (raise an error if it already exists)
'depvar', -- Expression for dependent variable
'indvar', -- Expression for independent variables
'clustervar', -- Column names for cluster variables, separated by comma
'grouping_col', -- Grouping regression column names, separated by comma, default NULL
max_iter, -- Maximum iteration number for logistic regression, default 20
'optimizer', -- Optimization method for logistic regression, default 'irls'
            tolerance,          -- Stop the computation when the difference of likelihoods
                                -- in two consecutive iterations is smaller than this value. Default 0.0001
            verbose_mode        -- Whether to print detailed information when computing logistic regression,
                                -- default is False
);
Output:
----------------------------------------------------------------
The output table has the following columns:
coef DOUBLE PRECISION[], -- Fitting coefficients
std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
z_stats DOUBLE PRECISION[], -- z-stats of the errors
p_values DOUBLE PRECISION[] -- p-values of the errors
The output summary table is the same as logregr_train(), see also:
SELECT logregr_train('usage');
""".format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for multi-logistic regression
# ========================================================================
def clustered_variance_mlogregr(
schema_madlib, source_table, out_table, dependent_varname,
independent_varname, cluster_varname, ref_category,
grouping_cols=None, optimizer_params=None, verbose_mode=None,
**kwargs):
"""
Multi-Logistic regression clustered standard errors
"""
old_msg_level = plpy.execute("select setting from pg_settings where \
name='client_min_messages'")[0]['setting']
plpy.execute("set client_min_messages to error")
# extract optimizer parameters
allowed_param_types = {
'max_iter': int,
'max_num_iterations': int,
'optimizer': str,
'tolerance': float,
'precision': float}
default_optimizer_values = {
'max_iter': 20,
'optimizer': 'irls',
'tolerance': 0.0001}
optimizer_param_dict = extract_keyvalue_params(
optimizer_params, allowed_param_types, default_optimizer_values)
if not optimizer_param_dict:
optimizer_param_dict = default_optimizer_values
if 'precision' in optimizer_param_dict and \
'tolerance' not in optimizer_param_dict:
optimizer_param_dict['tolerance'] = optimizer_param_dict['precision']
if 'max_num_iterations' in optimizer_param_dict and \
'max_iter' not in optimizer_param_dict:
optimizer_param_dict['max_iter'] = optimizer_param_dict['max_num_iterations']
if optimizer_param_dict['optimizer'].lower() == 'newton':
optimizer_param_dict['optimizer'] = 'irls'
validate_args_clustered_variance_mlogregr(
schema_madlib, source_table, out_table, dependent_varname,
independent_varname, cluster_varname, grouping_cols, ref_category,
verbose_mode, **optimizer_param_dict)
(fitres, coef_str, cluster_grouping_str, grouping_col_str) = __prepare_strings(cluster_varname, grouping_cols)
plpy.execute(
"""
SELECT
{schema_madlib}.mlogregr_train(
'{source_table}', '{fitres}',
'{dependent_varname}', '{independent_varname}', {ref_category},
'max_iter={max_iter}, optimizer={optimizer}, tolerance={tolerance}')
""".format(schema_madlib=schema_madlib, source_table=source_table,
fitres=fitres, dependent_varname=dependent_varname,
independent_varname=independent_varname,
ref_category=ref_category, **optimizer_param_dict))
out_table_summary = add_postfix(out_table, "_summary")
fitres_summary = add_postfix(fitres, "_summary")
# Rename the output summary table
rename_table(schema_madlib, fitres_summary, out_table_summary)
plpy.execute("UPDATE {0} SET out_table = '{1}'".
format(out_table_summary, out_table))
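    # mlogregr_train() stores one coefficient row per non-reference category;
    # matrix_agg collapses them into a single 2-D array so that max(coef) in the
    # generated clustered query sees the complete coefficient matrix.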
coef_table = unique_string()
plpy.execute("""
CREATE TEMP TABLE {coef_table} AS
SELECT
{schema_madlib}.matrix_agg(coef ORDER BY category) AS coef
FROM {fitres}
""".format(schema_madlib=schema_madlib, fitres=fitres, coef_table=coef_table))
plpy.execute(
"""
drop table if exists {fitres}
""".format(fitres=fitres))
plpy.execute(
__generate_clustered_sql(
schema_madlib=schema_madlib,
depvar=dependent_varname, indvar=independent_varname,
coef_str=coef_str, source_table=source_table, fitres=coef_table,
cluster_grouping_str=cluster_grouping_str, out_table=out_table,
ref_category=ref_category, regr_type="mlog"))
num_categories = plpy.execute(
"SELECT count(DISTINCT {0}) as n_cat FROM {1}".
format(dependent_varname, source_table))[0]['n_cat']
num_features = plpy.execute("""
SELECT
array_upper({independent_varname}, 1) fnum
FROM {source_table} LIMIT 1
""".format(independent_varname=independent_varname, source_table=source_table))[0]['fnum']
tmp_table = unique_string()
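    # The clustered computation above produces flat coef/std_err/z_stats/p_values
    # arrays spanning all categories; __mlogregr_format unrolls them into one
    # output row per category (relative to ref_category) before the final rename.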
plpy.execute("""
CREATE TABLE {tmp_table} AS
SELECT
({schema_madlib}.__mlogregr_format(
coef, {num_features},
{num_categories}, {ref_category})
).category AS category,
{ref_category} as ref_category,
({schema_madlib}.__mlogregr_format(
coef, {num_features},
{num_categories}, {ref_category})
).coef AS coef,
({schema_madlib}.__mlogregr_format(
std_err, {num_features},
{num_categories}, {ref_category})
).coef AS std_err,
({schema_madlib}.__mlogregr_format(
z_stats, {num_features},
{num_categories}, {ref_category})
).coef AS z_stats,
({schema_madlib}.__mlogregr_format(
p_values, {num_features},
{num_categories}, {ref_category})
).coef AS p_values
FROM
{out_table};
DROP TABLE IF EXISTS {out_table};
""".format(schema_madlib=schema_madlib, tmp_table=tmp_table,
num_features=num_features, num_categories=num_categories,
ref_category=ref_category, out_table=out_table))
# Rename the output table
    rename_table(schema_madlib, tmp_table, out_table)
plpy.execute("set client_min_messages to " + old_msg_level)
return None
# ========================================================================
def validate_args_clustered_variance_mlogregr(
schema_madlib, source_table, output_table, dependent_varname,
independent_varname, cluster_varname, grouping_col, ref_category,
verbose_mode, max_iter, optimizer, tolerance, **kwargs):
"""
Validate the parameters
"""
validate_args_clustered_variance(
schema_madlib, source_table, output_table, dependent_varname,
independent_varname, cluster_varname, grouping_col)
if ref_category is None:
plpy.error("Clustered variance error: Reference category "
"cannot be null!")
if ref_category < 0:
plpy.error("Clustered variance error: Reference category "
"cannot be negative!")
if not isinstance(verbose_mode, bool):
plpy.error("Clustered variance error: verbose_mode option "
"must be a boolean!")
if max_iter <= 0:
plpy.error("Clustered variance error: Maximum number of "
"iterations must be positive!")
if tolerance < 0:
plpy.error("Clustered variance error: The tolerance cannot "
"be negative!")
if optimizer not in ("newton", "irls"):
plpy.error(""" Clustered variance error: Unknown optimizer
requested. Must be 'newton'/'irls'.
""")
result_w_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
""".format(source=source_table,
indep=independent_varname,
dep=dependent_varname))
result_wo_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
AND NOT {madlib}.array_contains_null({indep})
""".format(madlib=schema_madlib, source=source_table,
indep=independent_varname,
dep=dependent_varname))
categories_wo_null = set(i["cat"] for i in result_wo_null)
categories_w_null = set(i["cat"] for i in result_w_null)
    _assert(categories_wo_null == categories_w_null,
            "Clustered variance error: For the categories {0}, every observation "
            "contains NULL values in the independent variables. These rows should "
            "be removed from the dataset before proceeding.".
            format(list(categories_w_null - categories_wo_null)))
# ========================================================================
def clustered_variance_mlogregr_help(schema_madlib, msg=None, **kwargs):
"""
Print help messages
"""
if msg is None or msg.strip(' ') in ('help', '?'):
return """
----------------------------------------------------------------
Summary
----------------------------------------------------------------
Computes the clustered standard errors for multi-logistic regression.
The function first runs a multi-logistic regression to get the fitting
coefficients. Then it computes the clustered standard errors for
the multi-logistic regression.
SELECT {schema_madlib}.clustered_variance_mlogregr(
'source_table',
'out_table',
'depvar',
'indvar',
'clustervar',
'ref_category',
'grouping_col',
max_iter,
'optimizer',
tolerance,
verbose_mode
);
--
Run:
SELECT {schema_madlib}.clustered_variance_mlogregr('usage');
to get more information.
""".format(schema_madlib=schema_madlib)
if msg.strip(' ') == 'usage':
return """
Usage:
----------------------------------------------------------------
SELECT {schema_madlib}.clustered_variance_mlogregr(
'source_table', -- Name of data table
'out_table', -- Name of result table (raise an error if it already exists)
'depvar', -- Expression for dependent variable
'indvar', -- Expression for independent variables
'clustervar', -- Column names for cluster variables, separated by comma
'ref_category', -- Reference category for the multi-logistic regression
'grouping_col', -- Grouping regression column names, separated by comma, default NULL
max_iter, -- Maximum iteration number for logistic regression, default 20
'optimizer', -- Optimization method for logistic regression, default 'irls'
            tolerance,          -- Stop the computation when the difference of likelihoods
                                -- in two consecutive iterations is smaller than this value. Default 0.0001
            verbose_mode        -- Whether to print detailed information when computing logistic regression,
                                -- default is False
);
Output:
----------------------------------------------------------------
The output table has the following columns:
ref_category INTEGER -- The reference category used in the multi-logistic regression
coef DOUBLE PRECISION[], -- Fitting coefficients
std_err DOUBLE PRECISION[], -- Clustered standard errors for coef
z_stats DOUBLE PRECISION[], -- z-stats of the errors
p_values DOUBLE PRECISION[] -- p-values of the errors
The output summary table is the same as mlogregr_train(), see also:
SELECT mlogregr_train('usage');
""".format(schema_madlib=schema_madlib)