| |
| # ------------------------------------------------------------------------ |
| # Compute clustered errors |
| # ------------------------------------------------------------------------ |
| |
| import plpy |
| from utilities.utilities import __unique_string |
| from utilities.utilities import _array_to_string |
| from utilities.utilities import _string_to_array |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_is_empty |
| from utilities.validate_args import scalar_col_has_no_null |
| from utilities.validate_args import table_exists |
| |
| # ======================================================================== |
| |
| def __prepare_strings(clustervar, grouping_col): |
| """ |
| Prepare the needed strings |
| """ |
| fitres = __unique_string() # result table for linear regression |
| |
| if grouping_col is None: |
| grouping_col_str = "NULL" |
| else: |
| grouping_col_str = "'" + grouping_col + "'" |
| |
| coef_str = __unique_string() |
| if clustervar is None: |
| cluster_grouping_str = "group by {coef_str}".format(coef_str = coef_str) |
| else: |
| cluster_grouping_str = "group by {coef_str}, ".format(coef_str = coef_str) + clustervar |
| |
| return (fitres, coef_str, cluster_grouping_str, grouping_col_str) |
| |
| # ------------------------------------------------------------------------ |
| |
| def __generate_clustered_sql (**kwargs): |
| """ |
| Create the SQL query to execute and create the result table |
| """ |
| |
| |
| sqlPart1 = """ |
| create table {tbl_output} as |
| select (f).* from ( |
| select {schema_madlib}.__clustered_{regr_type}_compute_stats( |
| max(coef), |
| m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(meatvec), |
| m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(breadvec), |
| count(numRows)::integer, (sum(numRows))::integer) as f |
| from ( |
| select (g).*, coef, numRows from ( |
| select""".format(**kwargs) |
| |
| if(kwargs['regr_type'] == 'mlog'): #We need to know the reference category and number of categories if doing the mlog regression |
| sqlPart2=""" {schema_madlib}.__clustered_err_{regr_type}_step({depvar}, |
| {indvar}, {coef_str}, (select count(distinct {depvar}) from {tbl_data})::INTEGER, ({ref_category})::INTEGER ) as g,""".format(**kwargs) |
| else: |
| sqlPart2=""" {schema_madlib}.__clustered_err_{regr_type}_step({depvar}, |
| {indvar}, {coef_str}) as g,""".format(**kwargs) |
| |
| sqlPart3=""" {coef_str} as coef, |
| count({depvar}) as numRows |
| from ( |
| select u.coef as {coef_str}, v.* |
| from |
| {fitres} u, {tbl_data} v |
| ) s |
| {cluster_grouping_str}) t) p) q |
| """.format(**kwargs) |
| return sqlPart1 + sqlPart2 + sqlPart3 |
| |
| |
| # ======================================================================== |
| |
| def clustered_variance_linregr (schema_madlib, tbl_data, tbl_output, |
| depvar, indvar, clustervar, grouping_col, |
| **kwargs): |
| """ |
| Linear regression clustered standard errors |
| """ |
| old_msg_level = plpy.execute("select setting from pg_settings where \ |
| name='client_min_messages'")[0]['setting'] |
| plpy.execute("set client_min_messages to error") |
| |
| validate_args_clustered_variance_linregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col) |
| |
| (fitres, coef_str, cluster_grouping_str, |
| grouping_col_str) = __prepare_strings(clustervar, grouping_col) |
| |
| plpy.execute( |
| """ |
| select {schema_madlib}.linregr_train('{tbl_data}', |
| '{fitres}', '{depvar}', '{indvar}', {grouping_col}) |
| """.format(schema_madlib = schema_madlib, tbl_data = tbl_data, |
| fitres = fitres, depvar = depvar, indvar = indvar, |
| grouping_col = grouping_col_str)) |
| |
| plpy.execute( |
| __generate_clustered_sql(schema_madlib = schema_madlib, depvar = depvar, |
| indvar = indvar, coef_str = coef_str, |
| tbl_data = tbl_data, fitres = fitres, |
| cluster_grouping_str = cluster_grouping_str, |
| tbl_output = tbl_output, regr_type = "lin")) |
| |
| plpy.execute( |
| """ |
| drop table if exists {fitres} |
| """.format(fitres = fitres)) |
| |
| plpy.execute("set client_min_messages to " + old_msg_level) |
| return None |
| |
| # ======================================================================== |
| |
| def validate_args_clustered_variance (schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col): |
| """ |
| Validate the parameters |
| """ |
| if not tbl_data or tbl_data in ('null', '') or not table_exists(tbl_data): |
| plpy.error("Clustered variance estimation error: Data table does not exist!") |
| |
| if table_is_empty(tbl_data): |
| plpy.error("Clustered variance estimation error: Data table is empty!") |
| |
| if tbl_output is None or tbl_output.lower() in ('null', ''): |
| plpy.error("Clustered variance estimation error: Invalid output table name!") |
| |
| if table_exists(tbl_output): |
| plpy.error("Clustered variance estimation error: Output table exists!") |
| |
| if depvar is None or (not isinstance(depvar, str)) or depvar.lower() in ('null', ''): |
| plpy.error("Clustered variance estimation error: Invalid dependent column name!") |
| |
| if indvar is None or (not isinstance(indvar, str)) or indvar.lower() in ('null', ''): |
| plpy.error("Clustered variance estimation error: Invalid independent column name!") |
| |
| # try: |
| # plpy.execute("select {indvar}".format(indvar = indvar)) |
| # isconst = True |
| # except: |
| # isconst = False |
| # if isconst: |
| # plpy.error("Clustered variance estimation error: independent variable is a constant!") |
| |
| # try: |
| # plpy.execute("select {depvar}".format(depvar = depvar)) |
| # notconst = False |
| # except: |
| # notconst = True |
| # if notconst is False: |
| # plpy.error("Clustered variance estimation error: dependent variable is a constant!") |
| |
| # try: |
| # plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = tbl_data)) |
| # success = True |
| # except: |
| # success = False |
| # if success is False: |
| # plpy.error("Clustered variance estimation error: independent variable does not exist in the data table!") |
| |
| # try: |
| # plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = tbl_data)) |
| # success = True |
| # except: |
| # success = False |
| # if not success: |
| # plpy.error("Clustered variance estimation error: dependent variable does not exist in the data table!") |
| |
| if not scalar_col_has_no_null(tbl_data, depvar): |
| plpy.error("Clustered variance estimation error: Dependent variable has Null values! \ |
| Please filter out Null values before using this function!") |
| |
| if clustervar is None or clustervar.lower() in ('null', ''): |
| # clustervar is optional but if provided should be valid column name |
| plpy.error("Clustered variance estimation error: Invalid cluster columns name!") |
| |
| if clustervar is not None: |
| if not columns_exist_in_table(tbl_data, |
| _string_to_array(clustervar), schema_madlib): |
| plpy.error("Clustered variance estimation error: Cluster column does not exist!") |
| |
| if grouping_col is not None and grouping_col.lower() in ('null', ''): |
| # grouping_col is optional but if provided should be valid column name |
| plpy.error("Clustered variance estimation error: Invalid grouping columns name!") |
| |
| if grouping_col: |
| if not columns_exist_in_table(tbl_data, |
| _string_to_array(grouping_col), schema_madlib): |
| plpy.error("Clustered variance estimation error: Grouping column does not exist!") |
| |
| |
| # ======================================================================== |
| |
| def validate_args_clustered_variance_linregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col): |
| """ |
| Validate the parameters |
| """ |
| validate_args_clustered_variance(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col) |
| |
| # ======================================================================== |
| |
| def clustered_variance_linregr_help (schema_madlib, msg = None, **kwargs): |
| """ |
| Print help messages |
| """ |
| if msg is None or msg.strip(' ') == 'help' or msg.strip(' ') == '?': |
| return """ |
| ---------------------------------------------------------------- |
| Summary |
| ---------------------------------------------------------------- |
| Computes the clustered standard errors for linear regression. |
| |
| The function first runs a linear regression to get the fitting |
| coefficients. Then it computes the clustered standard errors for |
| the linear regression. |
| |
| SELECT {schema_madlib}.clustered_variance_linregr( |
| 'tbl_data', |
| 'tbl_output', |
| 'depvar', |
| 'indvar', |
| 'clustervar', |
| 'grouping_col' |
| ); |
| |
| -- |
| Run: |
| SELECT {schema_madlib}.clustered_variance_linregr('usage'); |
| |
| to get more information. |
| """.format(schema_madlib = schema_madlib) |
| |
| if msg.strip(' ') == 'usage': |
| return """ |
| Usage: |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.clustered_variance_linregr( |
| 'tbl_data', -- Name of data table |
| 'tbl_output', -- Name of result table (raise an error if it already exists) |
| 'depvar', -- Expression for dependent variable |
| 'indvar', -- Expression for independent variables |
| 'clustervar', -- Column names for cluster variables, separated by comma |
| 'grouping_col' -- Grouping regression column names, separated by comma, default NULL |
| ); |
| |
| Output: |
| ---------------------------------------------------------------- |
| The output table has the following columns: |
| coef DOUBLE PRECISION[], -- Fitting coefficients |
| std_err DOUBLE PRECISION[], -- Clustered standard errors for coef |
| t_stats DOUBLE PRECISION[], -- t-stats of the errors |
| p_values DOUBLE PRECISION[] -- p-values of the errors |
| """.format(schema_madlib = schema_madlib) |
| |
| # ======================================================================== |
| # Clustered errors for logistic regression |
| # ======================================================================== |
| |
| def clustered_variance_logregr (schema_madlib, tbl_data, tbl_output, |
| depvar, indvar, clustervar, grouping_col, |
| max_iter, optimizer, tolerance, verbose, |
| **kwargs): |
| """ |
| Logistic regression clustered standard errors |
| """ |
| validate_args_clustered_variance_logregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col, |
| max_iter, optimizer, tolerance, |
| verbose) |
| |
| (fitres, coef_str, cluster_grouping_str, |
| grouping_col_str) = __prepare_strings(clustervar, grouping_col) |
| |
| plpy.execute( |
| """ |
| select {schema_madlib}.logregr_train('{tbl_data}', |
| '{fitres}', '{depvar}', '{indvar}', {grouping_col}, |
| {max_iter}, '{optimizer}', {tolerance}, {verbose}) |
| """.format(schema_madlib = schema_madlib, tbl_data = tbl_data, |
| fitres = fitres, depvar = depvar, indvar = indvar, |
| grouping_col = grouping_col_str, max_iter = max_iter, |
| optimizer = optimizer, tolerance = tolerance, |
| verbose = verbose)) |
| |
| old_msg_level = plpy.execute("select setting from pg_settings where \ |
| name='client_min_messages'")[0]['setting'] |
| plpy.execute("set client_min_messages to error") |
| |
| plpy.execute( |
| __generate_clustered_sql(schema_madlib = schema_madlib, depvar = depvar, |
| indvar = indvar, coef_str = coef_str, |
| tbl_data = tbl_data, fitres = fitres, |
| cluster_grouping_str = cluster_grouping_str, |
| tbl_output = tbl_output, regr_type = "log")) |
| |
| plpy.execute( |
| """ |
| drop table if exists {fitres} |
| """.format(fitres = fitres)) |
| |
| plpy.execute("set client_min_messages to " + old_msg_level) |
| return None |
| |
| # ======================================================================== |
| |
| def validate_args_clustered_variance_logregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col, |
| max_iter, optimizer, tolerance, |
| verbose): |
| """ |
| Validate the parameters |
| """ |
| validate_args_clustered_variance(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col) |
| if max_iter is None or max_iter <= 0: |
| plpy.error("Clustered variance estimation error: Maximum number of iterations must be a positive number!") |
| |
| if tolerance is None or tolerance < 0: |
| plpy.error("Clustered variance estimation error: The tolerance must be a non-negative number!") |
| |
| if verbose not in [True, False]: |
| plpy.error("Clustered variance estimation error: verbose option must be a boolean!") |
| |
| if optimizer == "newton": |
| optimizer = "irls" |
| elif optimizer not in ("irls", "cg", "igd"): |
| plpy.error(""" Clustered variance estimation error: Unknown optimizer requested. |
| Must be 'newton'/'irls', 'cg', or 'igd'. |
| """) |
| |
| # ======================================================================== |
| |
| def clustered_variance_logregr_help (schema_madlib, msg = None, **kwargs): |
| """ |
| Print help messages |
| """ |
| if msg is None or msg.strip(' ') == 'help' or msg.strip(' ') == '?': |
| return """ |
| ---------------------------------------------------------------- |
| Summary |
| ---------------------------------------------------------------- |
| Computes the clustered standard errors for logistic regression. |
| |
| The function first runs a logistic regression to get the fitting |
| coefficients. Then it computes the clustered standard errors for |
| the logistic regression. |
| |
| SELECT {schema_madlib}.clustered_variance_logregr( |
| 'tbl_data', |
| 'tbl_output', |
| 'depvar', |
| 'indvar', |
| 'clustervar', |
| 'grouping_col', |
| max_iter, |
| 'optimizer', |
| tolerance, |
| verbose |
| ); |
| |
| -- |
| Run: |
| SELECT {schema_madlib}.clustered_variance_logregr('usage'); |
| |
| to get more information. |
| """.format(schema_madlib = schema_madlib) |
| |
| if msg.strip(' ') == 'usage': |
| return """ |
| Usage: |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.clustered_variance_logregr( |
| 'tbl_data', -- Name of data table |
| 'tbl_output', -- Name of result table (raise an error if it already exists) |
| 'depvar', -- Expression for dependent variable |
| 'indvar', -- Expression for independent variables |
| 'clustervar', -- Column names for cluster variables, separated by comma |
| 'grouping_col', -- Grouping regression column names, separated by comma, default NULL |
| max_iter, -- Maximum iteration number for logistic regression, default 20 |
| 'optimizer', -- Optimization method for logistic regression, default 'irls' |
| tolerance, -- When difference of likelihoods in two consecutive iterations smaller than |
| -- this value, stops the computation. Default 0.0001 |
| verbose -- Whether print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| |
| Output: |
| ---------------------------------------------------------------- |
| The output table has the following columns: |
| coef DOUBLE PRECISION[], -- Fitting coefficients |
| std_err DOUBLE PRECISION[], -- Clustered standard errors for coef |
| z_stats DOUBLE PRECISION[], -- z-stats of the errors |
| p_values DOUBLE PRECISION[] -- p-values of the errors |
| """.format(schema_madlib = schema_madlib) |
| |
| |
| # ======================================================================== |
| # Clustered errors for multi-logistic regression |
| # ======================================================================== |
| |
| #NOTE. Because of the current mlogregr interface, the "verbose" parameter doesn't actually do anything at the moment |
| def clustered_variance_mlogregr (schema_madlib, tbl_data, tbl_output, |
| depvar, indvar, clustervar, ref_category, grouping_col, |
| max_iter, optimizer, tolerance, verbose, |
| **kwargs): |
| """ |
| Multi-Logistic regression clustered standard errors |
| """ |
| validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col, |
| max_iter, optimizer, tolerance, |
| verbose) |
| |
| (fitres, coef_str, cluster_grouping_str, |
| grouping_col_str) = __prepare_strings(clustervar, grouping_col) |
| |
| old_msg_level = plpy.execute("select setting from pg_settings where \ |
| name='client_min_messages'")[0]['setting'] |
| plpy.execute("set client_min_messages to error") |
| |
| #The multi-logistic has a different interface than the rest of the regressions, so we have to create a table to hold the result |
| plpy.execute(""" |
| create table {fitres}(coef float8[]); |
| """.format(fitres = fitres)) |
| |
| plpy.execute( |
| """ |
| insert into {fitres} (select (select coef from {schema_madlib}.mlogregr('{tbl_data}', |
| '{depvar}', '{indvar}', |
| {max_iter}, '{optimizer}', {tolerance}, {ref_category}))) |
| """.format(schema_madlib = schema_madlib, tbl_data = tbl_data, |
| fitres = fitres, depvar = depvar, indvar = indvar, |
| max_iter = max_iter, |
| optimizer = optimizer, tolerance = tolerance, |
| ref_category = ref_category)) |
| |
| |
| plpy.execute( |
| __generate_clustered_sql(schema_madlib = schema_madlib, depvar = depvar, |
| indvar = indvar, coef_str = coef_str, |
| tbl_data = tbl_data, fitres = fitres, |
| cluster_grouping_str = cluster_grouping_str, |
| tbl_output = tbl_output, ref_category=ref_category, |
| regr_type = "mlog")) |
| |
| plpy.execute( |
| """ |
| drop table if exists {fitres} |
| """.format(fitres = fitres)) |
| |
| ##Add in the reference category to the output column. |
| plpy.execute( |
| """ |
| ALTER table {tbl_output} add column ref_category INTEGER; |
| """.format(tbl_output = tbl_output) |
| ) |
| |
| plpy.execute( |
| """ |
| update {tbl_output} set ref_category={ref_category}; |
| """.format(tbl_output = tbl_output, ref_category=ref_category) |
| ) |
| |
| plpy.execute("set client_min_messages to " + old_msg_level) |
| return None |
| |
| # ======================================================================== |
| |
| def validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col, |
| max_iter, optimizer, tolerance, |
| verbose): |
| """ |
| Validate the parameters |
| """ |
| validate_args_clustered_variance(schema_madlib, tbl_data, |
| tbl_output, depvar, indvar, |
| clustervar, grouping_col) |
| if max_iter <= 0: |
| plpy.error("Clustered variance estimation error: Maximum number of iterations must be positive!") |
| |
| if tolerance < 0: |
| plpy.error("Clustered variance estimation error: The tolerance cannot be negative!") |
| |
| if optimizer == "newton": |
| optimizer = "irls" |
| elif optimizer not in ("irls"): |
| plpy.error(""" Clustered variance estimation error: Unknown optimizer requested. |
| Must be 'newton'/'irls'. |
| """) |
| |
| # ======================================================================== |
| |
| def clustered_variance_mlogregr_help (schema_madlib, msg = None, **kwargs): |
| """ |
| Print help messages |
| """ |
| if msg is None or msg.strip(' ') == 'help' or msg.strip(' ') == '?': |
| return """ |
| ---------------------------------------------------------------- |
| Summary |
| ---------------------------------------------------------------- |
| Computes the clustered standard errors for multi-logistic regression. |
| |
| The function first runs a multi-logistic regression to get the fitting |
| coefficients. Then it computes the clustered standard errors for |
| the multi-logistic regression. |
| |
| SELECT {schema_madlib}.clustered_variance_mlogregr( |
| 'tbl_data', |
| 'tbl_output', |
| 'depvar', |
| 'indvar', |
| 'clustervar', |
| 'ref_category', |
| 'grouping_col', |
| max_iter, |
| 'optimizer', |
| tolerance, |
| verbose |
| ); |
| |
| -- |
| Run: |
| SELECT {schema_madlib}.clustered_variance_mlogregr('usage'); |
| |
| to get more information. |
| """.format(schema_madlib = schema_madlib) |
| |
| if msg.strip(' ') == 'usage': |
| return """ |
| Usage: |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.clustered_variance_mlogregr( |
| 'tbl_data', -- Name of data table |
| 'tbl_output', -- Name of result table (raise an error if it already exists) |
| 'depvar', -- Expression for dependent variable |
| 'indvar', -- Expression for independent variables |
| 'clustervar', -- Column names for cluster variables, separated by comma |
| 'ref_category', -- Reference category for the multi-logistic regression |
| 'grouping_col', -- Grouping regression column names, separated by comma, default NULL |
| max_iter, -- Maximum iteration number for logistic regression, default 20 |
| 'optimizer', -- Optimization method for logistic regression, default 'irls' |
| tolerance, -- When difference of likelihoods in two consecutive iterations smaller than |
| -- this value, stops the computation. Default 0.0001 |
| verbose -- Whether print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| |
| Output: |
| ---------------------------------------------------------------- |
| The output table has the following columns: |
| ref_category INTEGER -- The reference category used in the multi-logistic regression |
| coef DOUBLE PRECISION[], -- Fitting coefficients |
| std_err DOUBLE PRECISION[], -- Clustered standard errors for coef |
| z_stats DOUBLE PRECISION[], -- z-stats of the errors |
| p_values DOUBLE PRECISION[] -- p-values of the errors |
| """.format(schema_madlib = schema_madlib) |