# ------------------------------------------------------------------------
# Compute clustered errors
# ------------------------------------------------------------------------
import plpy
from utilities.utilities import __unique_string
from utilities.utilities import _array_to_string
from utilities.utilities import _string_to_array
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
from utilities.validate_args import scalar_col_has_no_null
from utilities.validate_args import table_exists
# ========================================================================
def __prepare_strings(clustervar, grouping_col):
"""
Prepare the needed strings
"""
fitres = __unique_string() # result table for linear regression
if grouping_col is None:
grouping_col_str = "NULL"
else:
grouping_col_str = "'" + grouping_col + "'"
coef_str = __unique_string()
if clustervar is None:
cluster_grouping_str = "group by {coef_str}".format(coef_str = coef_str)
else:
cluster_grouping_str = "group by {coef_str}, ".format(coef_str = coef_str) + clustervar
return (fitres, coef_str, cluster_grouping_str, grouping_col_str)
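# Illustrative sketch only: for a hypothetical call
#     __prepare_strings('id_cluster', None)
# the helper returns something like
#     ('<temp_fit_table>', '<temp_coef_name>',
#      'group by <temp_coef_name>, id_cluster', 'NULL')
# where the first two entries are unique temporary names produced by
# __unique_string(); 'id_cluster' is an assumed column name used only for
# this example.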
# ------------------------------------------------------------------------
def __generate_clustered_sql(**kwargs):
    """
    Create the SQL query to execute and create the result table
    """
    sqlPart1 = """
        create table {tbl_output} as
            select (f).* from (
                select {schema_madlib}.__clustered_{regr_type}_compute_stats(
                    max(coef),
                    m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(meatvec),
                    m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(breadvec),
                    count(numRows)::integer, (sum(numRows))::integer) as f
                from (
                    select (g).*, coef, numRows from (
                        select""".format(**kwargs)
    if kwargs['regr_type'] == 'mlog':
        # We need to know the reference category and the number of categories
        # if doing the mlog regression
        sqlPart2 = """ {schema_madlib}.__clustered_err_{regr_type}_step({depvar},
            {indvar}, {coef_str},
            (select count(distinct {depvar}) from {tbl_data})::INTEGER,
            ({ref_category})::INTEGER) as g,""".format(**kwargs)
    else:
        sqlPart2 = """ {schema_madlib}.__clustered_err_{regr_type}_step({depvar},
            {indvar}, {coef_str}) as g,""".format(**kwargs)
    sqlPart3 = """ {coef_str} as coef,
                        count({depvar}) as numRows
                        from (
                            select u.coef as {coef_str}, v.*
                            from
                                {fitres} u, {tbl_data} v
                        ) s
                        {cluster_grouping_str}) t) p) q
        """.format(**kwargs)
    return sqlPart1 + sqlPart2 + sqlPart3
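# Rough sketch (for orientation only) of the query assembled above when
# regr_type = 'lin'. Identifiers in angle brackets stand for the caller's
# arguments, <coef_str> stands for the unique string generated for coef_str,
# and on PostgreSQL the two sum() calls are rewritten by the m4 macro to
# {schema_madlib}.__array_sum():
#
#   create table <tbl_output> as
#   select (f).* from (
#       select <schema_madlib>.__clustered_lin_compute_stats(
#                  max(coef), sum(meatvec), sum(breadvec),
#                  count(numRows)::integer, (sum(numRows))::integer) as f
#       from (
#           select (g).*, coef, numRows from (
#               select <schema_madlib>.__clustered_err_lin_step(
#                          <depvar>, <indvar>, <coef_str>) as g,
#                      <coef_str> as coef,
#                      count(<depvar>) as numRows
#               from (select u.coef as <coef_str>, v.*
#                     from <fitres> u, <tbl_data> v) s
#               group by <coef_str>, <clustervar>) t) p) q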
# ========================================================================
def clustered_variance_linregr(schema_madlib, tbl_data, tbl_output,
                               depvar, indvar, clustervar, grouping_col,
                               **kwargs):
    """
    Linear regression clustered standard errors
    """
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    plpy.execute(
        """
        select {schema_madlib}.linregr_train('{tbl_data}',
            '{fitres}', '{depvar}', '{indvar}', {grouping_col})
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   grouping_col=grouping_col_str))
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output, regr_type="lin"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
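# Hypothetical usage (the table, column, and schema names below are made up
# purely for illustration; the SQL-level wrapper is assumed to be installed
# in a schema called "madlib"):
#
#   SELECT madlib.clustered_variance_linregr(
#       'houses',                        -- data table
#       'houses_clustered_out',          -- output table (must not exist yet)
#       'price',                         -- dependent variable
#       'ARRAY[1, bedroom, bath, size]', -- independent variables
#       'zipcode',                       -- cluster column(s)
#       NULL                             -- no grouping columns
#   );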
# ========================================================================
def validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col):
    """
    Validate the parameters
    """
    if not tbl_data or tbl_data in ('null', '') or not table_exists(tbl_data):
        plpy.error("Clustered variance estimation error: Data table does not exist!")
    if table_is_empty(tbl_data):
        plpy.error("Clustered variance estimation error: Data table is empty!")
    if tbl_output is None or tbl_output.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid output table name!")
    if table_exists(tbl_output):
        plpy.error("Clustered variance estimation error: Output table exists!")
    if depvar is None or not isinstance(depvar, str) or depvar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid dependent column name!")
    if indvar is None or not isinstance(indvar, str) or indvar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid independent column name!")
    # try:
    #     plpy.execute("select {indvar}".format(indvar = indvar))
    #     isconst = True
    # except:
    #     isconst = False
    # if isconst:
    #     plpy.error("Clustered variance estimation error: independent variable is a constant!")
    # try:
    #     plpy.execute("select {depvar}".format(depvar = depvar))
    #     notconst = False
    # except:
    #     notconst = True
    # if notconst is False:
    #     plpy.error("Clustered variance estimation error: dependent variable is a constant!")
    # try:
    #     plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = tbl_data))
    #     success = True
    # except:
    #     success = False
    # if success is False:
    #     plpy.error("Clustered variance estimation error: independent variable does not exist in the data table!")
    # try:
    #     plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = tbl_data))
    #     success = True
    # except:
    #     success = False
    # if not success:
    #     plpy.error("Clustered variance estimation error: dependent variable does not exist in the data table!")
    if not scalar_col_has_no_null(tbl_data, depvar):
        plpy.error("Clustered variance estimation error: Dependent variable has Null values! "
                   "Please filter out Null values before using this function!")
    # clustervar is required and must name existing column(s)
    if clustervar is None or clustervar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid cluster column name!")
    if clustervar is not None:
        if not columns_exist_in_table(tbl_data,
                                      _string_to_array(clustervar), schema_madlib):
            plpy.error("Clustered variance estimation error: Cluster column does not exist!")
    # grouping_col is optional, but if provided it must name existing column(s)
    if grouping_col is not None and grouping_col.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid grouping column name!")
    if grouping_col:
        if not columns_exist_in_table(tbl_data,
                                      _string_to_array(grouping_col), schema_madlib):
            plpy.error("Clustered variance estimation error: Grouping column does not exist!")
# ========================================================================
def validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
# ========================================================================
def clustered_variance_linregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for linear regression.
        The function first runs a linear regression to get the fitting
        coefficients, and then computes the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_linregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'grouping_col'
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_linregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_linregr(
            'tbl_data',    -- Name of the data table
            'tbl_output',  -- Name of the result table (raises an error if it already exists)
            'depvar',      -- Expression for the dependent variable
            'indvar',      -- Expression for the independent variables
            'clustervar',  -- Comma-separated cluster column names
            'grouping_col' -- Comma-separated grouping column names, default NULL
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            coef     DOUBLE PRECISION[], -- Fitting coefficients
            std_err  DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            t_stats  DOUBLE PRECISION[], -- t-statistics of the coefficients
            p_values DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for logistic regression
# ========================================================================
def clustered_variance_logregr(schema_madlib, tbl_data, tbl_output,
                               depvar, indvar, clustervar, grouping_col,
                               max_iter, optimizer, tolerance, verbose,
                               **kwargs):
    """
    Logistic regression clustered standard errors
    """
    validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col,
                                             max_iter, optimizer, tolerance,
                                             verbose)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    plpy.execute(
        """
        select {schema_madlib}.logregr_train('{tbl_data}',
            '{fitres}', '{depvar}', '{indvar}', {grouping_col},
            {max_iter}, '{optimizer}', {tolerance}, {verbose})
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   grouping_col=grouping_col_str, max_iter=max_iter,
                   optimizer=optimizer, tolerance=tolerance,
                   verbose=verbose))
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output, regr_type="log"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
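# Hypothetical usage (made-up table and column names, "madlib" schema assumed);
# the last four arguments are the logistic-regression controls that are passed
# straight through to logregr_train:
#
#   SELECT madlib.clustered_variance_logregr(
#       'patients', 'patients_clustered_out',
#       'second_attack', 'ARRAY[1, treatment, trait_anxiety]',
#       'hospital_id', NULL,
#       20, 'irls', 0.0001, FALSE
#   );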
# ========================================================================
def validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col,
                                             max_iter, optimizer, tolerance,
                                             verbose):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
    if max_iter is None or max_iter <= 0:
        plpy.error("Clustered variance estimation error: Maximum number of iterations must be a positive number!")
    if tolerance is None or tolerance < 0:
        plpy.error("Clustered variance estimation error: The tolerance must be a non-negative number!")
    if verbose not in (True, False):
        plpy.error("Clustered variance estimation error: verbose option must be a boolean!")
    if optimizer == "newton":
        optimizer = "irls"
    elif optimizer not in ("irls", "cg", "igd"):
        plpy.error("""Clustered variance estimation error: Unknown optimizer requested.
            Must be 'newton'/'irls', 'cg', or 'igd'.""")
# ========================================================================
def clustered_variance_logregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for logistic regression.
        The function first runs a logistic regression to get the fitting
        coefficients, and then computes the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_logregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'grouping_col',
            max_iter,
            'optimizer',
            tolerance,
            verbose
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_logregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_logregr(
            'tbl_data',     -- Name of the data table
            'tbl_output',   -- Name of the result table (raises an error if it already exists)
            'depvar',       -- Expression for the dependent variable
            'indvar',       -- Expression for the independent variables
            'clustervar',   -- Comma-separated cluster column names
            'grouping_col', -- Comma-separated grouping column names, default NULL
            max_iter,       -- Maximum number of iterations for the logistic regression, default 20
            'optimizer',    -- Optimization method for the logistic regression, default 'irls'
            tolerance,      -- Stop when the difference between the log-likelihoods of two
                            -- consecutive iterations falls below this value, default 0.0001
            verbose         -- Whether to print detailed information when computing the
                            -- logistic regression, default FALSE
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            coef     DOUBLE PRECISION[], -- Fitting coefficients
            std_err  DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            z_stats  DOUBLE PRECISION[], -- z-statistics of the coefficients
            p_values DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for multi-logistic regression
# ========================================================================
# NOTE: Because of the current mlogregr interface, the "verbose" parameter
# currently has no effect.
def clustered_variance_mlogregr(schema_madlib, tbl_data, tbl_output,
                                depvar, indvar, clustervar, ref_category,
                                grouping_col, max_iter, optimizer, tolerance,
                                verbose, **kwargs):
    """
    Multi-logistic regression clustered standard errors
    """
    validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data,
                                              tbl_output, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    # The multi-logistic regression has a different interface than the other
    # regressions, so we create a table to hold its fitting result.
    plpy.execute("""
        create table {fitres}(coef float8[]);
        """.format(fitres=fitres))
    plpy.execute(
        """
        insert into {fitres}
        (select (select coef from {schema_madlib}.mlogregr('{tbl_data}',
            '{depvar}', '{indvar}',
            {max_iter}, '{optimizer}', {tolerance}, {ref_category})))
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   max_iter=max_iter, optimizer=optimizer,
                   tolerance=tolerance, ref_category=ref_category))
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output,
                                 ref_category=ref_category,
                                 regr_type="mlog"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    # Add the reference category as a column of the output table.
    plpy.execute(
        """
        ALTER table {tbl_output} add column ref_category INTEGER;
        """.format(tbl_output=tbl_output))
    plpy.execute(
        """
        update {tbl_output} set ref_category = {ref_category};
        """.format(tbl_output=tbl_output, ref_category=ref_category))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
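# Hypothetical usage (made-up table and column names, "madlib" schema assumed);
# the integer passed as ref_category is also written into the ref_category
# column of the output table:
#
#   SELECT madlib.clustered_variance_mlogregr(
#       'test3', 'test3_clustered_out',
#       'y', 'ARRAY[1, x1, x2]',
#       'id_cluster', 0, NULL,
#       20, 'irls', 0.0001, FALSE
#   );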
# ========================================================================
def validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data,
                                              tbl_output, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
    if max_iter <= 0:
        plpy.error("Clustered variance estimation error: Maximum number of iterations must be positive!")
    if tolerance < 0:
        plpy.error("Clustered variance estimation error: The tolerance cannot be negative!")
    if optimizer == "newton":
        optimizer = "irls"
    elif optimizer not in ("irls",):
        plpy.error("""Clustered variance estimation error: Unknown optimizer requested.
            Must be 'newton'/'irls'.""")
# ========================================================================
def clustered_variance_mlogregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for multinomial logistic
        regression. The function first runs a multinomial logistic
        regression to get the fitting coefficients, and then computes
        the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_mlogregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'ref_category',
            'grouping_col',
            max_iter,
            'optimizer',
            tolerance,
            verbose
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_mlogregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_mlogregr(
            'tbl_data',     -- Name of the data table
            'tbl_output',   -- Name of the result table (raises an error if it already exists)
            'depvar',       -- Expression for the dependent variable
            'indvar',       -- Expression for the independent variables
            'clustervar',   -- Comma-separated cluster column names
            'ref_category', -- Reference category for the multinomial logistic regression
            'grouping_col', -- Comma-separated grouping column names, default NULL
            max_iter,       -- Maximum number of iterations for the logistic regression, default 20
            'optimizer',    -- Optimization method for the logistic regression, default 'irls'
            tolerance,      -- Stop when the difference between the log-likelihoods of two
                            -- consecutive iterations falls below this value, default 0.0001
            verbose         -- Whether to print detailed information when computing the
                            -- logistic regression, default FALSE
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            ref_category INTEGER,            -- Reference category used in the regression
            coef         DOUBLE PRECISION[], -- Fitting coefficients
            std_err      DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            z_stats      DOUBLE PRECISION[], -- z-statistics of the coefficients
            p_values     DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)