| # coding=utf-8 |
| |
| |
| """ |
| @file robust_mlogistic.py_in |
| |
| @namespace robust |
| |
| @brief Robust variance: Common functions |
| """ |
| import plpy |
| from utilities.utilities import unique_string |
| from utilities.utilities import extract_keyvalue_params |
| from utilities.utilities import _assert |
| from utilities.utilities import add_postfix |
| |
| from utilities.validate_args import table_exists |
| from utilities.utilities import rename_table |
| from utilities.validate_args import columns_exist_in_table |
| |
| from regress.robust_linear import _robust_linregr_validate |
| |
| # use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0 |
| from utilities.utilities import __mad_version |
| version_wrapper = __mad_version() |
| string_to_array = version_wrapper.select_vecfunc() |
| array_to_string = version_wrapper.select_vec_return() |
| |
| |
| def _robust_mlogregr_validate(schema_madlib, source_table, |
| dependent_varname, independent_varname, |
| ref_category, max_iter, optimizer, |
| tolerance, **kwargs): |
| |
| _assert(ref_category is not None, |
| "Robust Variance: Reference category cannot be null!") |
| _assert(ref_category >= 0, |
| "Robust Variance error: Reference category cannot be negative!") |
| _assert(max_iter > 0, |
| "Robust Variance error: Maximum number of iterations must be positive!") |
| _assert(tolerance >= 0, |
| "Robust Variance error: The tolerance cannot be negative!") |
| _assert(optimizer.lower() in ("irls", "newton"), |
| "Robust Variance error: Optimizer does not exist. Must be 'newton'/'irls'.") |
| |
| result_w_null = plpy.execute(""" |
| SELECT DISTINCT {dep} AS cat |
| FROM {source} |
| WHERE {dep} is not NULL |
| """.format(source=source_table, |
| indep=independent_varname, |
| dep=dependent_varname)) |
| result_wo_null = plpy.execute(""" |
| SELECT DISTINCT {dep} AS cat |
| FROM {source} |
| WHERE {dep} is not NULL |
| AND NOT {madlib}.array_contains_null({indep}) |
| """.format(madlib=schema_madlib, source=source_table, |
| indep=independent_varname, |
| dep=dependent_varname)) |
| |
| categories_wo_null = set(i["cat"] for i in result_wo_null) |
| categories_w_null = set(i["cat"] for i in result_w_null) |
| |
| _assert(categories_wo_null == categories_w_null, |
| "Robust Variance error: All observations of category set {0} contain " |
| "NULL values. These rows should be removed from the dataset " |
| "before proceeding.". |
| format(list(categories_w_null - categories_wo_null))) |
| # ------------------------------------------------------------------------- |
| |
| |
| # Main function call for robust variance multinomial logistic regression |
| def robust_variance_mlogregr( |
| schema_madlib, source_table, |
| out_table, dependent_varname, |
| independent_varname, ref_category, |
| grouping_cols=None, optimizer_params=None, regr_coef=None, |
| verbose_mode=False, **kwargs): |
| """ |
| @brief A wrapper function for the robust_variance_mlogregr. |
| |
| @param source_table string, name of the input table |
| @param out_table string, name of the output table to be created |
| @param dependent_varname: string, Column containing the dependent variable |
| @param independent_varname string, Column containing the array of independent variables |
| @param ref_category int, Reference category for multinomial logistic regression |
| @param grouping_cols string, Set of columns to group by. |
| @param optimizer_params: string, Comma-separated string of optimizer parameters |
| Supported parameters: |
| max_iter: Maximum number of iterations (Default = 20) |
| optimzer: Optimizer to be used (newton/irls, Default = irls) |
| tolerance: Residual tolerance (Default = 0.0001) |
| |
| To include an intercept in the model, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| Returns: |
| None |
| """ |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| if verbose_mode: |
| plpy.execute('SET client_min_messages TO warning') |
| else: |
| plpy.execute('SET client_min_messages TO error') |
| |
| #extract optimizer parameters |
| allowed_param_types = {'max_iter': int, 'max_num_iterations': int, |
| 'optimizer': str, |
| 'tolerance': float, 'precision': float} |
| default_optimizer_values = {'max_iter': 20, |
| 'optimizer': 'irls', |
| 'tolerance': 0.0001} |
| optimizer_param_dict = extract_keyvalue_params(optimizer_params, |
| allowed_param_types, |
| default_optimizer_values) |
| if not optimizer_param_dict: |
| optimizer_param_dict = default_optimizer_values |
| if 'precision' in optimizer_param_dict and \ |
| 'tolerance' not in optimizer_param_dict: |
| optimizer_param_dict['tolerance'] = optimizer_param_dict['precision'] |
| if 'max_num_iterations' in optimizer_param_dict and \ |
| 'max_iter' not in optimizer_param_dict: |
| optimizer_param_dict['max_iter'] = optimizer_param_dict['max_num_iterations'] |
| |
| if optimizer_param_dict['optimizer'].lower() == 'newton': |
| optimizer_param_dict['optimizer'] = 'irls' |
| |
| # Validate arguments |
| _robust_linregr_validate(schema_madlib, source_table, out_table, |
| dependent_varname, independent_varname, |
| grouping_cols, verbose_mode) |
| all_arguments = {'schema_madlib': schema_madlib, |
| 'source_table': source_table, |
| 'output_table': out_table, |
| 'dependent_varname': dependent_varname, |
| 'independent_varname': independent_varname, |
| 'ref_category': ref_category, |
| 'grouping_cols': grouping_cols} |
| all_arguments.update(optimizer_param_dict) |
| |
| _robust_mlogregr_validate(**all_arguments) |
| |
| out_table_summary=add_postfix(out_table, "_summary") |
| if not grouping_cols: |
| mlog_out_table = "pg_temp." + unique_string() |
| mlog_out_table_summary=add_postfix(mlog_out_table, "_summary") |
| all_arguments['mlog_out_table'] = mlog_out_table |
| |
| # Run regression |
| plpy.execute(""" |
| SELECT {schema_madlib}.mlogregr_train( |
| '{source_table}', '{mlog_out_table}', |
| '{dependent_varname}', '{independent_varname}', {ref_category}, |
| 'max_iter={max_iter}, optimizer={optimizer}, tolerance={tolerance}') |
| """.format(**all_arguments)) |
| |
| plpy.execute(""" |
| CREATE TABLE {out_table_summary} AS |
| SELECT |
| source_table, |
| '{out_table}'::TEXT AS out_table, |
| dependent_varname, |
| independent_varname, |
| optimizer_params, |
| ref_category, |
| num_rows_processed, |
| num_missing_rows_skipped |
| FROM |
| {mlog_out_table_summary} |
| """.format(out_table=out_table, |
| out_table_summary=out_table_summary, |
| mlog_out_table_summary=mlog_out_table_summary)) |
| |
| all_arguments['num_categories'] = plpy.execute( |
| "SELECT count(DISTINCT {0}) as n_cat FROM {1}". |
| format(dependent_varname, source_table))[0]['n_cat'] |
| all_arguments['num_features'] = plpy.execute(""" |
| SELECT |
| array_upper({independent_varname}, 1) fnum |
| FROM {source_table} LIMIT 1 |
| """.format(**all_arguments))[0]['fnum'] |
| |
| # Run Robust Variance |
| plpy.execute(""" |
| CREATE TABLE {output_table} AS |
| SELECT |
| ({schema_madlib}.__mlogregr_format( |
| (result).coef, {num_features}, |
| {num_categories}, {ref_category}) |
| ).category AS category, |
| {ref_category} as ref_category, |
| ({schema_madlib}.__mlogregr_format( |
| (result).coef, {num_features}, |
| {num_categories}, {ref_category}) |
| ).coef AS coef, |
| ({schema_madlib}.__mlogregr_format( |
| (result).std_err, {num_features}, |
| {num_categories}, {ref_category}) |
| ).coef AS std_err, |
| ({schema_madlib}.__mlogregr_format( |
| (result).z_stats, {num_features}, |
| {num_categories}, {ref_category}) |
| ).coef AS z_stats, |
| ({schema_madlib}.__mlogregr_format( |
| (result).p_values, {num_features}, |
| {num_categories}, {ref_category}) |
| ).coef AS p_values |
| FROM |
| ( |
| SELECT |
| {schema_madlib}.robust_mlogregr( |
| ({dependent_varname})::INTEGER, |
| {num_categories}, |
| {ref_category}, |
| {independent_varname}, |
| (SELECT |
| {schema_madlib}.matrix_agg(coef ORDER BY category) |
| FROM {mlog_out_table}) |
| ) AS result |
| FROM {source_table} |
| ) t1 |
| """.format(**all_arguments)) |
| |
| # Remove the output table of mlogregr_train() |
| plpy.execute('DROP TABLE IF EXISTS ' + mlog_out_table) |
| |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |
| # ======================================================================== |
| |
| |
| def robust_variance_mlogregr_new(schema_madlib, |
| model_table, |
| out_table, |
| **kwargs): |
| """ |
| Updated function interface to extract parameters from mlogregr_train outputs |
| |
| Input to this function is the model table obtained from mlogregr_train. All |
| necessary parameters are obtained from the model table. |
| |
| Args: |
| @param model_table: string, Name of the table returned by mlogregr_train |
| @param output_table: string, name of the output table to be created |
| """ |
| _assert(model_table is not None and |
| model_table.strip().lower() not in ('null', ''), |
| "Robust Variance error: Invalid model table name") |
| |
| _assert(table_exists(model_table), |
| "Robust Variance error: Model table {0} does not exist". |
| format(model_table)) |
| |
| model_table_summary = add_postfix(model_table, "_summary") |
| _assert(table_exists(model_table_summary), |
| "Robust Variance error: Model Summary table {0} does not exist". |
| format(model_table_summary)) |
| |
| _assert(out_table is not None and |
| out_table.strip().lower() not in ('null', ''), |
| "Robust Variance error: Invalid output table name") |
| |
| _assert(not table_exists(out_table, only_first_schema=True), |
| "Robust Variance error: Output table {0}" |
| " already exists".format(str(out_table))) |
| |
| _assert(columns_exist_in_table(model_table, |
| ['coef']), |
| "Robust Variance error: Invalid model data table" |
| " - column coef missing") |
| |
| _assert(columns_exist_in_table(model_table_summary, |
| ['source_table', 'dependent_varname', 'independent_varname', |
| 'optimizer_params', 'ref_category']), |
| "Robust Variance error: Invalid model summary table" |
| " - required columns missing") |
| |
| # info is a dict that contains source_table, ind_var, dep_var, |
| info = plpy.execute("SELECT * FROM {0}".format(model_table_summary))[0] |
| info['out_table'] = out_table |
| coef = string_to_array(plpy.execute("SELECT coef from {0}". |
| format(model_table))[0]['coef']) |
| robust_variance_mlogregr(schema_madlib=schema_madlib, regr_coef=coef, |
| verbose_mode=False, **info) |
| # ------------------------------------------------------------------------- |
| |
| |
| def robust_variance_mlogregr_help(schema_madlib, message, **kwargs): |
| """ |
| Help function for robust_variance_mlogregr |
| |
| Args: |
| @param schema_madlib |
| @param message: string, Help message string |
| @param kwargs |
| |
| Returns: |
| String. Help/usage information |
| """ |
| if not message: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| SUMMARY |
| ----------------------------------------------------------------------- |
| Functionality: Calculate Huber-White robust statistics for multinomial logistic regression |
| |
| For more details on function usage: |
| SELECT {schema_madlib}.robust_variance_mlogregr('usage') |
| """ |
| elif message in ['usage', 'help', '?']: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| USAGE |
| ----------------------------------------------------------------------- |
| SELECT {schema_madlib}.robust_variance_mlogregr( |
| 'source_table', -- Name of data table |
| 'out_table', -- Name of result table |
| 'dependent_varname', -- Name of column for dependent variables |
| 'independent_varname', -- Name of column for independent variables |
| ref_category, -- Reference category for the multinomial logistic regression |
| -- (Optional, DEFAULT=0) |
| 'grouping_cols', -- An expression list used to group the input dataset into discrete groups. |
| -- (Optional, DEFAUT=NULL. |
| -- Not currently implemented. Any non-NULL value is ignored.) |
| 'optimizer_params', -- The optimizer parameters as a comma-separated string |
| -- (Optional, DEFAULT: max_iter=20, optimizer=irls, tolerance=0.0001) |
| verbose_mode -- When TRUE, provides verbose output of the results of training. |
| (Optional, DEFAUT=FALSE) |
| ); |
| |
| ----------------------------------------------------------------------- |
| OUTUPT |
| ----------------------------------------------------------------------- |
| The output table ('out_table' above) has the following columns |
| coef DOUBLE PRECISION[], -- Coefficients of mlogit regression |
| std_err DOUBLE PRECISION[], -- Huber-White standard errors |
| z_stats DOUBLE PRECISION[], -- Z-stats of the standard errors |
| p_values DOUBLE PRECISION[] -- p-values of the standard errors |
| |
| The output summary table is the same as mlogregr_train(), see also: |
| SELECT mlogregr_train('usage'); |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.robust_variance_mlogregr()" |
| |
| return help_string.format(schema_madlib=schema_madlib) |