| # coding=utf-8 |
| |
| |
| """ |
| @file robust_linear.py_in |
| |
| @namespace robust |
| |
| @brief Robust variance: Common functions |
| """ |
| import plpy |
| from utilities.utilities import _assert |
| from utilities.utilities import unique_string |
| from utilities.validate_args import rename_table |
| |
| # use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0 |
| from utilities.utilities import __mad_version |
| version_wrapper = __mad_version() |
| string_to_array = version_wrapper.select_vecfunc() |
| array_to_string = version_wrapper.select_vec_return() |
| |
| from regress.robust_linear import _robust_linregr_validate |
| |
| |
| def _robust_logregr_validate(max_iter, optimizer, tolerance, **kwargs): |
| _assert(max_iter is not None, |
| "Robust Variance error: The max_iter should not be NULL!") |
| _assert(max_iter > 0, |
| "Robust Variance error: Maximum number of iterations must be positive!") |
| |
| _assert(tolerance is not None, |
| "Robust Variance error: The tolerance should not be NULL!") |
| _assert(tolerance >= 0, |
| "Robust Variance error: The tolerance cannot be negative!") |
| |
| _assert(optimizer is not None, |
| "Robust Variance error: The optimizer should not be NULL!") |
| _assert(optimizer.lower() in ("irls", "newton"), |
| "Robust Variance error: Optimizer does not exist. Must be 'newton'/'irls'.") |
| |
| |
| def robust_logregr_help(schema_madlib, message, **kwargs): |
| if not message: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| SUMMARY |
| ----------------------------------------------------------------------- |
| Functionality: Calculate Huber-White robust statistics for logistic regression |
| |
| For more details on function usage: |
| SELECT {schema_madlib}.robust_variance_logregr('usage'); |
| """ |
| elif message in ['usage', 'help', '?']: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| USAGE |
| ----------------------------------------------------------------------- |
| SELECT {schema_madlib}.robust_variance_logregr( |
| 'source_table', -- Name of data table |
| 'output_table', -- Name of result table |
| 'dependent_varname', -- Name of column for dependent variables |
| 'independent_varname', -- Name of column for independent variables |
| (can be any SQL expression that evaluates to an array) |
| 'grouping_cols', -- [OPTIONAL] Comma separated string with columns to group by. Default is NULL. |
| 'max_iter', -- [OPTIONAL] The number of iterations used by the logistic regression solver. Default is 20. |
| 'optimizer', -- [OPTIONAL] Name of the optimizer used in the logistic regression. Default is irls. |
| 'tolerance', -- [OPTIONAL] The tolerance of the logistic regression optimizer. Default is 0.0001. |
| 'verbose' -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE. |
| ); |
| ----------------------------------------------------------------------- |
| OUTUPT |
| ----------------------------------------------------------------------- |
| The output table ('output_table' above) has the following columns: |
| 'coef' DOUBLE PRECISION[], -- Coefficients of regression |
| 'std_err' DOUBLE PRECISION[], -- Huber-White standard errors |
| 'stats' DOUBLE PRECISION[], -- Z-stats of the standard errors |
| 'p_values' DOUBLE PRECISION[] -- p-values of the standard errors |
| |
| The output summary table is the same as logregr_train(), see also: |
| SELECT logregr_train('usage'); |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.robust_variance_linregr()" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| |
| # ------------------------------------------------------------------------- |
| |
| def robust_variance_logregr( |
| schema_madlib, source_table, out_table, dependent_varname, |
| independent_varname, grouping_cols=None, max_iter=20, optimizer='irls', |
| tolerance=0.0001, verbose=False, **kwargs): |
| """ |
| @brief A wrapper function for the robust_variance_mlogregr. |
| |
| @param source_table string, name of the input table |
| @param out_table string, name of the output table to be created |
| @param dependent_varname: string, Column containing the dependent variable |
| @param independent_varname string, Column containing the array of independent variables |
| @param grouping_cols string, Set of columns to group by. |
| |
| To include an intercept in the model, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| Returns: |
| None |
| """ |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| if verbose: |
| plpy.execute('SET client_min_messages TO warning') |
| else: |
| plpy.execute('SET client_min_messages TO error') |
| |
| if optimizer is not None and optimizer.lower() == 'newton': |
| optimizer = 'irls' |
| _robust_linregr_validate(schema_madlib, source_table, out_table, |
| dependent_varname, independent_varname, grouping_cols, verbose) |
| _robust_logregr_validate(max_iter, optimizer, tolerance) |
| |
| group_str = '' if grouping_cols is None else 'GROUP BY %s' % grouping_cols |
| group_str_sel = '' if grouping_cols is None else grouping_cols + ',' |
| join_str = ',' if grouping_cols is None else 'JOIN' |
| using_str = '' if grouping_cols is None else 'USING (%s)' % grouping_cols |
| group_col_str = 'NULL' if grouping_cols is None else "'" + grouping_cols + "'" |
| optimizer_str = 'NULL' if optimizer is None else "'" + optimizer + "'" |
| maxiter_str = 'NULL' if max_iter is None else max_iter |
| tolerance_str = 'NULL' if tolerance is None else tolerance |
| |
| logr_out_table = "pg_temp." + unique_string() |
| rb_model = unique_string() |
| |
| # Run logistic regression |
| plpy.execute(""" |
| SELECT {schema_madlib}.logregr_train( |
| '{source_table}', '{logr_out_table}', |
| '{dependent_varname}', '{independent_varname}', {group_col_str}, |
| {maxiter_str}, {optimizer_str}, {tolerance_str}, {verbose}) |
| """.format(schema_madlib=schema_madlib, source_table=source_table, |
| logr_out_table=logr_out_table, |
| dependent_varname=dependent_varname, |
| independent_varname=independent_varname, |
| group_col_str=group_col_str, |
| maxiter_str=maxiter_str, optimizer_str=optimizer_str, |
| tolerance_str=tolerance_str, verbose=verbose)) |
| |
| plpy.execute(""" |
| CREATE TABLE {out_table}_summary AS |
| SELECT |
| source_table, |
| '{out_table}'::TEXT AS out_table, |
| dependent_varname, |
| independent_varname, |
| optimizer_params, |
| num_all_groups, |
| num_failed_groups, |
| num_rows_processed, |
| num_missing_rows_skipped |
| FROM |
| {old}_summary |
| """.format(out_table=out_table, old=logr_out_table)) |
| |
| # Run robust logistic regression |
| plpy.execute(""" |
| CREATE TABLE {out_table} AS |
| SELECT |
| {group_str_sel} |
| ({rb_model}).coef, ({rb_model}).std_err, |
| ({rb_model}).z_stats, ({rb_model}).p_values |
| FROM |
| ( |
| SELECT |
| {group_str_sel} |
| {schema_madlib}.robust_logregr( |
| ({dependent_varname})::BOOLEAN, |
| {independent_varname}, |
| {logr_out_table}.coef) AS {rb_model} |
| FROM |
| {source_table} {join_str} {logr_out_table} {using_str} |
| {group_str} |
| ) t1 |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, out_table=out_table, |
| dependent_varname=dependent_varname, |
| independent_varname=independent_varname, |
| group_str_sel=group_str_sel, group_str=group_str, |
| join_str=join_str, using_str=using_str, |
| logr_out_table=logr_out_table, rb_model=rb_model)) |
| |
| # Drop the output table of logregr_train() |
| plpy.execute('DROP TABLE IF EXISTS ' + logr_out_table) |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |