# coding=utf-8
"""
@file robust_linear.py_in
@namespace robust
@brief Robust variance: Huber-White robust variance for logistic regression
"""
import plpy
from utilities.utilities import _assert
from utilities.utilities import unique_string
from utilities.utilities import rename_table
# use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0
from utilities.utilities import __mad_version
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
from regress.robust_linear import _robust_linregr_validate
def _robust_logregr_validate(max_iter, optimizer, tolerance, **kwargs):
_assert(max_iter is not None,
"Robust Variance error: The max_iter should not be NULL!")
_assert(max_iter > 0,
"Robust Variance error: Maximum number of iterations must be positive!")
_assert(tolerance is not None,
"Robust Variance error: The tolerance should not be NULL!")
_assert(tolerance >= 0,
"Robust Variance error: The tolerance cannot be negative!")
_assert(optimizer is not None,
"Robust Variance error: The optimizer should not be NULL!")
_assert(optimizer.lower() in ("irls", "newton"),
"Robust Variance error: Unknown optimizer. Must be 'newton' or 'irls'.")
def robust_logregr_help(schema_madlib, message, **kwargs):
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Functionality: Calculate Huber-White robust statistics for logistic regression
For more details on function usage:
SELECT {schema_madlib}.robust_variance_logregr('usage');
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.robust_variance_logregr(
'source_table', -- Name of data table
'output_table', -- Name of result table
'dependent_varname', -- Name of column for dependent variables
'independent_varname', -- Name of column for independent variables
(can be any SQL expression that evaluates to an array)
'grouping_cols', -- [OPTIONAL] Comma separated string with columns to group by. Default is NULL.
'max_iter', -- [OPTIONAL] The maximum number of iterations used by the logistic regression solver. Default is 20.
'optimizer', -- [OPTIONAL] Name of the optimizer used in the logistic regression. Default is irls.
'tolerance', -- [OPTIONAL] The tolerance of the logistic regression optimizer. Default is 0.0001.
'verbose' -- [OPTIONAL] Whether the optimizer should print warning messages to the screen. Default is FALSE.
);
-----------------------------------------------------------------------
OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) has the following columns:
'coef' DOUBLE PRECISION[], -- Coefficients of regression
'std_err' DOUBLE PRECISION[], -- Huber-White standard errors
'z_stats' DOUBLE PRECISION[], -- z-statistics of the coefficients
'p_values' DOUBLE PRECISION[] -- p-values of the coefficients
The output summary table is the same as that of logregr_train(); see also:
SELECT {schema_madlib}.logregr_train('usage');
"""
else:
help_string = "No such option. Use {schema_madlib}.robust_variance_linregr()"
return help_string.format(schema_madlib=schema_madlib)
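# For illustration only: assuming the library is installed under a schema named
# 'madlib', the help text above would typically be retrieved with
#   SELECT madlib.robust_variance_logregr();          -- summary
#   SELECT madlib.robust_variance_logregr('usage');   -- full usage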
# -------------------------------------------------------------------------
def robust_variance_logregr(
schema_madlib, source_table, out_table, dependent_varname,
independent_varname, grouping_cols=None, max_iter=20, optimizer='irls',
tolerance=0.0001, verbose=False, **kwargs):
"""
@brief Driver function for Huber-White robust variance of logistic regression.
@param source_table string, name of the input table
@param out_table string, name of the output table to be created
@param dependent_varname string, Column containing the dependent variable
@param independent_varname string, Column containing the array of independent variables
@param grouping_cols string, Comma-separated list of columns to group by (optional)
@param max_iter integer, Maximum number of iterations for the logistic regression optimizer
@param optimizer string, Optimizer used by the logistic regression ('irls' or 'newton')
@param tolerance float, Convergence tolerance of the logistic regression optimizer
@param verbose boolean, Whether to print optimizer warning messages
To include an intercept in the model, set one coordinate in the
<tt>independent_varname</tt> array to 1.
Returns:
None
"""
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
if verbose:
plpy.execute('SET client_min_messages TO warning')
else:
plpy.execute('SET client_min_messages TO error')
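# 'newton' is handled as an alias for the IRLS optimizer before validation
# and before the name is passed on to logregr_train()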
if optimizer is not None and optimizer.lower() == 'newton':
optimizer = 'irls'
_robust_linregr_validate(schema_madlib, source_table, out_table,
dependent_varname, independent_varname, grouping_cols, verbose)
_robust_logregr_validate(max_iter, optimizer, tolerance)
group_str = '' if grouping_cols is None else 'GROUP BY %s' % grouping_cols
group_str_sel = '' if grouping_cols is None else grouping_cols + ','
join_str = ',' if grouping_cols is None else 'JOIN'
using_str = '' if grouping_cols is None else 'USING (%s)' % grouping_cols
group_col_str = 'NULL' if grouping_cols is None else "'" + grouping_cols + "'"
optimizer_str = 'NULL' if optimizer is None else "'" + optimizer + "'"
maxiter_str = 'NULL' if max_iter is None else max_iter
tolerance_str = 'NULL' if tolerance is None else tolerance
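# For illustration, with a hypothetical grouping_cols = 'gender, region' the
# SQL fragments built above expand to:
#   group_str     -> 'GROUP BY gender, region'
#   group_str_sel -> 'gender, region,'
#   join_str      -> 'JOIN'
#   using_str     -> 'USING (gender, region)'
#   group_col_str -> "'gender, region'"
# With grouping_cols = None, the query below degenerates to a plain cross join
# (join_str = ',') with no GROUP BY clause.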
logr_out_table = "pg_temp." + unique_string()
rb_model = unique_string()
# Run logistic regression
plpy.execute("""
SELECT {schema_madlib}.logregr_train(
'{source_table}', '{logr_out_table}',
'{dependent_varname}', '{independent_varname}', {group_col_str},
{maxiter_str}, {optimizer_str}, {tolerance_str}, {verbose})
""".format(schema_madlib=schema_madlib, source_table=source_table,
logr_out_table=logr_out_table,
dependent_varname=dependent_varname,
independent_varname=independent_varname,
group_col_str=group_col_str,
maxiter_str=maxiter_str, optimizer_str=optimizer_str,
tolerance_str=tolerance_str, verbose=verbose))
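# logregr_train() writes the fitted coefficients to logr_out_table and also
# creates a companion <logr_out_table>_summary table; both intermediate tables
# are consumed by the queries below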
plpy.execute("""
CREATE TABLE {out_table}_summary AS
SELECT
source_table,
'{out_table}'::TEXT AS out_table,
dependent_varname,
independent_varname,
optimizer_params,
num_all_groups,
num_failed_groups,
num_rows_processed,
num_missing_rows_skipped
FROM
{old}_summary
""".format(out_table=out_table, old=logr_out_table))
# Run robust logistic regression
plpy.execute("""
CREATE TABLE {out_table} AS
SELECT
{group_str_sel}
({rb_model}).coef, ({rb_model}).std_err,
({rb_model}).z_stats, ({rb_model}).p_values
FROM
(
SELECT
{group_str_sel}
{schema_madlib}.robust_logregr(
({dependent_varname})::BOOLEAN,
{independent_varname},
{logr_out_table}.coef) AS {rb_model}
FROM
{source_table} {join_str} {logr_out_table} {using_str}
{group_str}
) t1
""".format(schema_madlib=schema_madlib,
source_table=source_table, out_table=out_table,
dependent_varname=dependent_varname,
independent_varname=independent_varname,
group_str_sel=group_str_sel, group_str=group_str,
join_str=join_str, using_str=using_str,
logr_out_table=logr_out_table, rb_model=rb_model))
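# For readability, with no grouping columns (and alias names shortened) the
# generated statement above is roughly:
#   CREATE TABLE <out_table> AS
#   SELECT (m).coef, (m).std_err, (m).z_stats, (m).p_values
#   FROM (
#       SELECT <schema_madlib>.robust_logregr(
#                  (<dependent_varname>)::BOOLEAN,
#                  <independent_varname>,
#                  <logr_out_table>.coef) AS m
#       FROM <source_table>, <logr_out_table>
#   ) t1;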
# Drop the intermediate tables created by logregr_train()
plpy.execute('DROP TABLE IF EXISTS ' + logr_out_table + ', ' + logr_out_table + '_summary')
plpy.execute("SET client_min_messages TO %s" % old_msg_level)