| """ |
| @file margins.py_in |
| |
| @brief Marginal Effects with Interactions: Contains the main interface function |
| and other functions that are common to the various methods and related to |
| database constructs. |
| |
| @namespace marginal |
| """ |
| import plpy |
| from utilities.utilities import _assert |
| from utilities.validate_args import table_exists |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_is_empty |
| |
| from utilities.utilities import py_list_to_sql_string |
| from utilities.utilities import add_postfix |
| from utilities.control import MinWarning |
| |
| from margins_builder import MarginalEffectsBuilder |
| from margins_builder import TermBase |
| |
| import re |
| #------------------------------------------------------------------------------ |
| |
| |
| def margins(schema_madlib, model_table, out_table, x_design=None, |
| source_table=None, marginal_vars=None, *args, **kwargs): |
| """ |
| Call the appropriate margins functions depending on the regression type |
| found in the model table. |
| |
| Args: |
| @param schema_madlib |
| @param model_table |
| @param out_table |
| @param x_design |
| @param source_table |
| @param marginal_vars |
| @param args |
| @param kwargs |
| |
| Returns: |
| None |
| """ |
| with MinWarning('warning'): |
| # 1) validate all arguments |
| margins_validate_model_table(model_table) |
| if model_table: |
| model_summary_table = add_postfix(model_table, "_summary") |
| else: |
| raise ValueError("Margins error: Invalid regression model table name!") |
| |
| if not source_table: |
| source_table = plpy.execute("SELECT source_table FROM {0}". |
| format(model_summary_table))[0]['source_table'] |
| |
| margins_validate_args(out_table, source_table, x_design) |
| |
| # 2) get the regression type ... |
| reg_type = plpy.execute( |
| "SELECT method from {0}".format(model_summary_table))[0]['method'] |
| margins_method = None |
| if not reg_type: |
| plpy.error("Margins error: Regression type cannot be obtained " |
| "from the model table. 'margins()' currently only " |
| "supported for linregr, logregr, mlogregr, or coxph") |
| elif reg_type in ("linregr", "linear", "linear_regression", |
| "linear regression"): |
| margins_method = margins_linregr |
| elif reg_type in ("logregr", "logistic", "logistic_regression", |
| "logistic regression"): |
| margins_method = margins_logregr |
| elif reg_type in ("mlogregr", "multilogistic", |
| "multilogistic_regression", |
| "multinomial logistic regression"): |
| margins_method = margins_mlogregr |
| elif reg_type in ("coxph", "cox_proportional_hazards", |
| "cox prop", "cox proportional hazards"): |
| margins_method = margins_coxph |
| else: |
| plpy.error(""" |
| Margins not supported for this model table. |
| Re-run training {linregr, logregr, mlogregr, coxph}_train |
| before using margins().""") |
| |
| # 3) ... and call the appropriate sub-function |
| margins_method(schema_madlib, model_table, out_table, source_table, |
| x_design, marginal_vars, *args, **kwargs) |
| #------------------------------------------------------------------------------ |
| |
| |
| def margins_help(schema_madlib, message, **kwargs): |
| """ |
| Help function for margins |
| |
| Args: |
| @param schema_madlib |
| @param message: string, Help message string |
| @param kwargs |
| |
| Returns: |
| String. Help/usage information |
| """ |
| if not message: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| SUMMARY |
| ----------------------------------------------------------------------- |
| Functionality: Calculate marginal effects for binomial/multinomial logistic |
| regression with interaction terms. |
| A marginal effect (ME) or partial effect measures the effect on the |
| conditional mean of a response (dependent variable) for a change in one of the |
| regressors (independent variable). |
| |
| We currently only support margins for linear, logistic, |
| multinomial logistic regression, and cox proportional hazard. |
| |
| For more details on function usage: |
| SELECT {schema_madlib}.margins('usage') |
| """ |
| elif message in ['usage', 'help', '?']: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| USAGE |
| ----------------------------------------------------------------------- |
| SELECT {schema_madlib}.margins( |
| 'model_table', -- Name of table containing regression model |
| 'output_table', -- Name of result table |
| 'x_design', -- Design of the independent variables |
| -- (Optional, if not provided or NULL then independent variable list |
| is assumed to have no interaction terms) |
| 'source_table', -- Source table to apply marginal effects on |
| -- (Optional, if not provided or NULL then assume training table as the source) |
| 'marginal_vars' -- Variable identifiers (strings same as those in x_design) to |
| -- calculate marginal effects on |
| -- (Optional, if not provided or NULL then marginal effects for all |
| -- basis variables will be returned) |
| ); |
| |
| ----------------------------------------------------------------------- |
| OUTPUT |
| ----------------------------------------------------------------------- |
| The output table ('output_table' above) has the following columns |
| variables INTEGER[], -- Indices of the basis variables, |
| -- will be same as marginal vars if provided |
| margins DOUBLE PRECISION[], -- Marginal effects |
| std_err DOUBLE PRECISION[], -- Standard errors using delta method |
| z_stats DOUBLE PRECISION[], -- z-stats of the standard errors |
| p_values DOUBLE PRECISION[], -- p-values of the standard errors |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.margins()" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| # --------------------------------------------------------------------- |
| |
| |
| def margins_validate_model_table(model_table): |
| """ |
| Args: |
| @param schema_madlib |
| @param model_table |
| |
| Returns: |
| |
| """ |
| _assert(model_table and |
| model_table.strip().lower() not in ('null', ''), |
| "Margins error: Invalid regression model table name!") |
| _assert(table_exists(model_table), |
| "Margins error: Specified Model table ({0}) is missing! " |
| "Rerun underlying regression".format(model_table)) |
| |
| model_summary_table = add_postfix(model_table, '_summary') |
| _assert(table_exists(model_summary_table), |
| "Margins error: Summary for model table ({0}) is missing! " |
| "Rerun underlying regression".format(model_summary_table)) |
| |
| _assert(columns_exist_in_table(model_summary_table, |
| ['method', 'source_table', 'out_table', 'dependent_varname', |
| 'independent_varname']), |
| "Margins error: Invalid model summary table ({0})" |
| " - some required columns missing".format(model_summary_table)) |
| return True |
| #------------------------------------------------------------------------------ |
| |
| |
| def margins_validate_args(out_table, source_table, x_design=None, |
| **kwargs): |
| _assert(out_table and |
| out_table.strip().lower() not in ('null', ''), |
| "Margins error: Invalid output table name!") |
| _assert(not table_exists(out_table, only_first_schema=True), |
| "Margins error: Output table already exists!") |
| |
| _assert(source_table and source_table.strip().lower() not in ('null', ''), |
| "Margins error: Invalid data table name!") |
| _assert(table_exists(source_table), |
| "Margins error: Data table ({0}) is missing!".format(source_table)) |
| _assert(not table_is_empty(source_table), |
| "Margins error: Data table ({0}) is empty!".format(source_table)) |
| # ------------------------------------------------------------------------- |
| |
| |
| def _get_categorical_set_unset_strings(x_design_parser, indep_name='x', shortened_output=False): |
| """ |
| Return strings that represent the discrete difference strings (set and unset) |
| string, along with the indices for all the categorical variables. |
| """ |
| subset_indicators = x_design_parser.get_subset_indicator_terms() |
| # subset_cat_indices represents the indicator variables |
| # that are present in x_design_parser.subset_basis |
| indicators_set_string = [] |
| indicators_unset_string = [] |
| create_indicator_string = lambda x: (py_list_to_sql_string(x, |
| array_type="double precision")) |
| for each_term in subset_indicators: |
| set_list, unset_list = x_design_parser.get_discrete_diff_arrays( |
| each_term, indep_name, shortened_output=shortened_output) |
| set_list[:] = [i for i in set_list if i is not None] |
| unset_list[:] = [i for i in unset_list if i is not None] |
| # create two matrices - 1 with all set expressions, |
| # 1 with all unset expressions |
| indicators_set_string.append(create_indicator_string(set_list)) |
| indicators_unset_string.append(create_indicator_string(unset_list)) |
| |
| # create strings that represent the categorical discrete differences |
| if subset_indicators: |
| return [py_list_to_sql_string(i, array_type="double precision") |
| for i in (indicators_set_string, indicators_unset_string)] |
| else: |
| return ("NULL::double precision[]", "NULL::double precision[]") |
| #------------------------------------------------------------------------------ |
| |
| |
| def _parse_marginal_vars(marginal_var_str): |
| """ |
| Marginal vars is supposed to be a list of identifiers separated by ','. |
| It's possible that the user inputs it as an array string |
| eg: ARRAY[1, 2, 3, 4] or '{2, 4, 5}'. We strip out the 'ARRAY' if present in |
| front and also strip square/curly brackets. |
| """ |
| if str.lower(marginal_var_str.strip()[:6]) == "array[": |
| marginal_var_str = marginal_var_str[5:] |
| marginal_var_str = marginal_var_str.strip(" {}[]") |
| marginal_var_list = [] |
| # reg = re.compile('".*?"|[^\,]+') |
| # all_elem = reg.findall(marginal_var_str.strip()) |
| comma_reg = re.compile(r'((?:[^,"]|"[^"]*")+)') |
| all_elem = [i.strip() for i in |
| comma_reg.split(marginal_var_str.strip())[1::2]] |
| for each_str in all_elem: |
| each_str = each_str.strip() |
| if not (each_str.startswith('"') and each_str.endswith('"')): |
| if not re.match(r'^\w+$', each_str): |
| raise ValueError("Invalid identifier '{0}' in marginal_var. " |
| "Quote an identifier with non-alphanumeric " |
| "characters using double quotes.". |
| format(each_str)) |
| each_str = each_str.lower() |
| # if each_str is not quoted then always keep a lower-case copy |
| # (i.e. each_str is case-insensitive) |
| # else: |
| # # if each_str is quoted then we keep the case intact, but |
| # # strip the double-quotes. |
| # each_str = each_str.strip('"') |
| marginal_var_list.append(each_str) |
| return marginal_var_list |
| #------------------------------------------------------------------------------ |
| |
| |
| def _get_regression_arguments(schema_madlib, model_table): |
| """ |
| Return all necessary argument values from the model and summary table of |
| underlying regression |
| |
| Args: |
| @params model_table: str, Name of the model table containing regression |
| model |
| Returns: |
| Tuple: |
| coef: list, Coefficient values from regression. |
| independent_varname: str, The string passed as independent_varname |
| in underlying regression. |
| grouping_col: str, If grouping was used in the underlying regression, |
| then this argument gives the grouping columns |
| as a comma-separated list. This value is None |
| if no grouping was used. |
| """ |
| if model_table: |
| model_summary_table = add_postfix(model_table, "_summary") |
| else: |
| raise ValueError("Margins error: Invalid regression model table name!") |
| |
| coef = plpy.execute("SELECT coef as x FROM {0}".format(model_table))[0]['x'] |
| independent_varname = plpy.execute("SELECT independent_varname as x FROM {0}". |
| format(model_summary_table))[0]['x'] |
| if columns_exist_in_table(model_summary_table, ['grouping_col'], schema_madlib): |
| grouping_col = plpy.execute("SELECT grouping_col AS x FROM {0}". |
| format(model_summary_table))[0]['x'] |
| else: |
| grouping_col = None |
| if any(i is None for i in (coef, independent_varname)): |
| plpy.error("Margins error: Model table ({0}) is missing important parameters. " |
| "Rerun logistic regression.".format(model_table)) |
| return coef, independent_varname, grouping_col |
| #------------------------------------------------------------------------------ |
| |
| |
| def _get_parser(coef, x_design, marginal_vars=None): |
| """ |
| Args: |
| @param x_design: str, String representation of the design elements |
| (see MarginalEffectsBuilder for example) |
| @param marginal_vars: str, comma-separated list of integers - indexes |
| of independent variables to output. |
| If None, then all variables are output. |
| |
| Returns: |
| Tuple. |
| x_design_parser: Object representing a marginal effects parser |
| marginal_var_list: List of marginal variables |
| |
| """ |
| if marginal_vars: |
| marginal_vars = marginal_vars.strip() |
| |
| n_indep = len(coef) |
| if not x_design: |
| x_design = ','.join(str(i+1) for i in range(n_indep)) |
| |
| x_design_parser = MarginalEffectsBuilder(x_design) |
| |
| _assert(x_design_parser.n_total_terms == n_indep, |
| "Margins error: Invalid x_design. Number of terms in x_design ({0}) " |
| "not same as number of independent variables ({1})". |
| format(x_design_parser.n_total_terms, n_indep)) |
| |
| if marginal_vars: |
| # basis terms are not necessarily continguous |
| # example: basis_terms = [1, 2, 3, 5, 8, 11] |
| # If all variables are needed in output then |
| # marginal_var_list would be [1, 2, 3, 4, 5, 6] (max value would be len(basis_terms)) |
| marginal_var_list = _parse_marginal_vars(marginal_vars) |
| _assert(all(TermBase(x) in x_design_parser.inverse_basis_terms |
| for x in marginal_var_list), |
| "Margins error: marginal_vars argument contains values not " |
| "present as a basis variable in x_design") |
| else: |
| # marginal_var_list |
| marginal_var_list = x_design_parser.get_sorted_basis_identifiers() |
| |
| # do not return any answer for reference terms |
| ref_terms_removed = [] |
| if x_design_parser.reference_terms: |
| for each_ref in x_design_parser.reference_terms.values(): |
| if each_ref.identifier in marginal_var_list: |
| marginal_var_list.remove(each_ref.identifier) |
| ref_terms_removed.append(each_ref.identifier) |
| |
| x_design_parser.subset_basis = marginal_var_list |
| if ref_terms_removed: |
| plpy.warning(""" |
| Warning: Reference terms (Variable(s): {ref_terms}) are present in output variables. " |
| No marginal effects (and statistics) will be provided for these terms. |
| """.format(ref_terms=', '.join(str(i) for i in ref_terms_removed))) |
| |
| return x_design_parser |
| #------------------------------------------------------------------------------ |
| |
| |
| #------------------------------------------------------------------------------ |
| # Linear regression marginal effects |
| #------------------------------------------------------------------------------ |
| def margins_linregr(schema_madlib, model_table, out_table, source_table, |
| x_design=None, marginal_vars=None, *args, **kwargs): |
| """ |
| Args: |
| @param schema_madlib |
| @param model_table |
| @param out_table |
| @param x_design |
| @param source_table |
| @param marginal_vars |
| |
| Returns: |
| None |
| |
| Raises: |
| |
| """ |
| # 1) Get all arguments and validate them |
| coef, independent_varname, grouping_col = _get_regression_arguments(schema_madlib, model_table) |
| |
| # 2) Parse x_design and marginal_vars argument |
| x_design_parser = _get_parser(coef, x_design, marginal_vars) |
| # subset_basis contains indices for a 1-base array |
| if not x_design_parser.subset_basis: |
| raise ValueError("Invalid marginal variables selected.") |
| |
| # 3) Create a matrix representing the 2nd partial derivative |
| # (1st order wrt x and 2nd order wrt \beta). For categorical, this is |
| # the discrete difference. |
| indep_name = 'x' |
| derivative_list = x_design_parser.create_2nd_derivative_matrix(indep_name) |
| derivative_str = py_list_to_sql_string( |
| [py_list_to_sql_string(i, array_type="double precision") |
| for i in derivative_list], |
| array_type="double precision") |
| # grouping arguments |
| if not grouping_col: |
| grouping_str1 = "" |
| grouping_str2 = "" |
| using_str = "" |
| join_str = "," |
| else: |
| grouping_str1 = grouping_col + "," |
| grouping_str2 = "GROUP BY " + grouping_col |
| using_str = "USING (" + grouping_col + ")" |
| join_str = "LEFT OUTER JOIN " |
| |
| execute_string = """ |
| CREATE TABLE {out_table} AS |
| SELECT |
| {grouping_str1} |
| {index_array} as variables, |
| (res).margins as margins, |
| (res).std_err as std_err, |
| (res).z_stats as z_stats, |
| (res).p_values as p_values |
| FROM ( |
| SELECT |
| {grouping_str1} |
| {schema_madlib}.__margins_int_linregr_agg( |
| {indep_name} |
| , coef |
| , variance_covariance |
| , {derivative_str}) |
| as res |
| FROM ( |
| SELECT |
| {grouping_str1} |
| {independent_varname} as {indep_name} |
| FROM {source_table} as s |
| ) q1 {join_str} {model_table} as m {using_str} |
| {grouping_str2} |
| ) q2 |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, |
| out_table=out_table, |
| independent_varname=independent_varname, |
| indep_name=indep_name, |
| index_array=py_list_to_sql_string( |
| [x_design_parser.basis_terms[i] |
| for i in x_design_parser.subset_basis], |
| array_type="text"), |
| model_table=model_table, |
| derivative_str=derivative_str, |
| grouping_str1=grouping_str1, |
| grouping_str2=grouping_str2, |
| using_str=using_str, |
| join_str=join_str) |
| plpy.execute(execute_string) |
| |
| |
| #------------------------------------------------------------------------------ |
| # Logistic regression marginal effects |
| #------------------------------------------------------------------------------ |
| def margins_logregr(schema_madlib, model_table, out_table, source_table, |
| x_design=None, marginal_vars=None, |
| *args, **kwargs): |
| """ |
| Args: |
| @param schema_madlib |
| @param model_table |
| @param out_table |
| @param x_design |
| @param source_table |
| @param marginal_vars |
| |
| Returns: |
| None |
| |
| Raises: |
| ValueError |
| """ |
| # 1) Get all arguments and validate them |
| coef, independent_varname, grouping_col = _get_regression_arguments(schema_madlib, model_table) |
| |
| # 2) Get the x_design and marginal_vars arguments |
| x_design_parser = _get_parser(coef, x_design, marginal_vars) |
| # subset_basis contains indices for a 1-base array |
| if not x_design_parser.subset_basis: |
| raise ValueError("Invalid marginal variables selected.") |
| |
| # 3) Create a matrix representing the 2nd partial derivative |
| # (1st order wrt x and 2nd order wrt \beta) |
| indep_name = 'x' |
| if x_design_parser.contains_interaction(): |
| derivative_list = x_design_parser.create_2nd_derivative_matrix(indep_name) |
| derivative_str = py_list_to_sql_string( |
| [py_list_to_sql_string(i, array_type="double precision") |
| for i in derivative_list], |
| array_type="double precision") |
| else: |
| derivative_str = "NULL::double precision[]" |
| |
| # 4) Build the margins for indicator variables as discrete differences |
| set_string, unset_string = _get_categorical_set_unset_strings( |
| x_design_parser, indep_name, shortened_output=True) |
| |
| if x_design_parser.contains_indicators(): |
| zero_base_categorical_indices = py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.get_all_indicator_indices()], |
| array_type="double precision") |
| else: |
| zero_base_categorical_indices = "NULL::double precision[]" |
| |
| # grouping arguments |
| if not grouping_col: |
| grouping_str1 = "" |
| grouping_str2 = "" |
| using_str = "" |
| join_str = "," |
| else: |
| grouping_str1 = grouping_col + "," |
| grouping_str2 = "GROUP BY " + grouping_col |
| using_str = "USING (" + grouping_col + ")" |
| join_str = "LEFT OUTER JOIN " |
| |
| execute_string = """ |
| CREATE TABLE {out_table} AS |
| SELECT |
| {grouping_str1} |
| {index_array} as variables, |
| (res).margins as margins, |
| (res).std_err as std_err, |
| (res).z_stats as z_stats, |
| (res).p_values as p_values |
| FROM ( |
| SELECT |
| {grouping_str1} |
| {schema_madlib}.__margins_int_logregr_agg( |
| {indep_name} |
| , coef |
| , variance_covariance |
| , {zero_base_subset_indices} |
| , {derivative_str} |
| , {zero_base_categorical_indices} |
| , {set_string} |
| , {unset_string}) |
| as res |
| FROM ( |
| SELECT |
| {grouping_str1} |
| {independent_varname} as {indep_name} |
| FROM {source_table} as s |
| ) q1 {join_str} {model_table} as m {using_str} |
| {grouping_str2} |
| ) q2 |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, |
| out_table=out_table, |
| independent_varname=independent_varname, |
| indep_name=indep_name, |
| index_array=py_list_to_sql_string( |
| [x_design_parser.basis_terms[i] |
| for i in x_design_parser.subset_basis], |
| array_type="text"), |
| zero_base_categorical_indices=zero_base_categorical_indices, |
| model_table=model_table, |
| derivative_str=derivative_str, |
| set_string=set_string, |
| unset_string=unset_string, |
| grouping_str1=grouping_str1, |
| grouping_str2=grouping_str2, |
| using_str=using_str, |
| join_str=join_str, |
| zero_base_subset_indices=py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.subset_basis], |
| array_type="double precision")) |
| plpy.execute(execute_string) |
| |
| |
| #------------------------------------------------------------------------------ |
| # Multinomial Logistic regression marginal effects |
| #------------------------------------------------------------------------------ |
| def margins_mlogregr(schema_madlib, model_table, out_table, source_table, |
| x_design=None, marginal_vars=None, |
| *args, **kwargs): |
| """ |
| Args: |
| @param schema_madlib |
| @param model_table |
| @param out_table |
| @param x_design |
| @param source_table |
| @param marginal_vars |
| |
| Returns: |
| None |
| |
| Raises: |
| |
| """ |
| # 1) Get all arguments and validate them |
| coef, independent_varname, grouping_col = _get_regression_arguments(schema_madlib, model_table) |
| summary_table = add_postfix(model_table, "_summary") |
| |
| # 2) Parse x_design and marginal_vars argument |
| x_design_parser = _get_parser(coef, x_design, marginal_vars) |
| if not x_design_parser.subset_basis: |
| raise ValueError("Invalid marginal variables selected.") |
| |
| # 3) Create a matrix representing the 2nd partial derivative |
| # (1st order wrt x and 2nd order wrt \beta) |
| indep_name = 'x' |
| if x_design_parser.contains_interaction(): |
| derivative_list = x_design_parser.create_2nd_derivative_matrix(indep_name) |
| derivative_str = py_list_to_sql_string( |
| [py_list_to_sql_string(i, array_type="double precision") |
| for i in derivative_list], |
| array_type="double precision") |
| else: |
| derivative_str = "NULL::double precision[]" |
| |
| # 4) Build the margins for indicator variables as discrete differences |
| set_string, unset_string = _get_categorical_set_unset_strings( |
| x_design_parser, indep_name, shortened_output=True) |
| |
| if x_design_parser.contains_indicators(): |
| zero_base_categorical_indices = py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.get_all_indicator_indices()], |
| array_type="double precision") |
| else: |
| zero_base_categorical_indices = "NULL::double precision[]" |
| |
| execute_string = """ |
| CREATE TABLE {out_table} AS |
| SELECT |
| category, |
| ref_category, |
| {index_array} as variables, |
| margins, |
| std_err, |
| z_stats, |
| p_values |
| FROM |
| ( |
| SELECT |
| ref_category, |
| ({schema_madlib}.__mlogregr_format( |
| (res).margins, {num_basis}, num_categories, ref_category) |
| ).category, |
| ({schema_madlib}.__mlogregr_format( |
| (res).margins, {num_basis}, num_categories, ref_category) |
| ).coef as margins, |
| ({schema_madlib}.__mlogregr_format( |
| (res).std_err, {num_basis}, num_categories, ref_category) |
| ).coef as std_err, |
| ({schema_madlib}.__mlogregr_format( |
| (res).z_stats, {num_basis}, num_categories, ref_category) |
| ).coef as z_stats, |
| ({schema_madlib}.__mlogregr_format( |
| (res).p_values, {num_basis}, num_categories, ref_category) |
| ).coef as p_values |
| FROM |
| ( |
| SELECT |
| num_categories, |
| ref_category, |
| {schema_madlib}.__margins_int_mlogregr_agg( |
| {indep_name} |
| , coef |
| , variance_covariance |
| , {zero_base_subset_indices} |
| , {derivative_str} |
| , {zero_base_categorical_indices} |
| , {set_string} |
| , {unset_string}) |
| as res |
| FROM |
| ( |
| SELECT |
| {independent_varname} as {indep_name} |
| FROM {source_table} as s |
| ) q1, {summary_table} as m |
| GROUP BY num_categories, ref_category |
| ) q2 |
| ) q3 |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, |
| out_table=out_table, |
| independent_varname=independent_varname, |
| indep_name=indep_name, |
| index_array=py_list_to_sql_string( |
| [x_design_parser.basis_terms[i] |
| for i in x_design_parser.subset_basis], |
| array_type="text"), |
| zero_base_subset_indices=py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.subset_basis], |
| array_type="double precision"), |
| zero_base_categorical_indices=zero_base_categorical_indices, |
| summary_table=summary_table, |
| derivative_str=derivative_str, |
| set_string=set_string, |
| unset_string=unset_string, |
| num_basis=len(x_design_parser.subset_basis)) |
| plpy.execute(execute_string) |
| # ------------------------------------------------------------------------- |
| |
| |
| #------------------------------------------------------------------------------ |
| # Cox Proportional Hazards marginal effects |
| #------------------------------------------------------------------------------ |
| def margins_coxph(schema_madlib, model_table, out_table, source_table, |
| x_design=None, marginal_vars=None, |
| *args, **kwargs): |
| """ |
| Args: |
| @param schema_madlib |
| @param model_table |
| @param out_table |
| @param x_design |
| @param source_table |
| @param marginal_vars |
| |
| Returns: |
| None |
| |
| Raises: |
| |
| """ |
| # 1) Get all arguments and validate them |
| coef, independent_varname, grouping_col = _get_regression_arguments(schema_madlib, model_table) |
| summary_table = add_postfix(model_table, "_summary") |
| |
| strata_cols = plpy.execute("SELECT strata from " + summary_table)[0]["strata"] |
| |
| if not strata_cols: |
| strata_col_str = "" |
| strata_cols = "" |
| else: |
| strata_col_str = "GROUP BY " + strata_cols |
| strata_cols += "," |
| |
| # 2) Parse x_design and marginal_vars argument |
| x_design_parser = _get_parser(coef, x_design, marginal_vars) |
| if not x_design_parser.subset_basis: |
| raise ValueError("Invalid marginal variables selected.") |
| |
| # 3) Create a matrix representing the 2nd partial derivative |
| # (1st order wrt x and 2nd order wrt \beta) |
| indep_name = 'x' |
| if x_design_parser.contains_interaction(): |
| derivative_list = x_design_parser.create_2nd_derivative_matrix(indep_name) |
| derivative_str = py_list_to_sql_string( |
| [py_list_to_sql_string(i, array_type="double precision") |
| for i in derivative_list], |
| array_type="double precision") |
| else: |
| derivative_str = "NULL::double precision[]" |
| |
| # 4) Build the margins for indicator variables as discrete differences |
| set_string, unset_string = _get_categorical_set_unset_strings( |
| x_design_parser, indep_name, shortened_output=True) |
| |
| if x_design_parser.contains_indicators(): |
| zero_base_categorical_indices = py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.get_all_indicator_indices()], |
| array_type="double precision") |
| else: |
| zero_base_categorical_indices = "NULL::double precision[]" |
| |
| execute_string = """ |
| CREATE TABLE {out_table} AS |
| SELECT |
| {index_array} as variables, |
| (res).margins as margins, |
| (res).std_err as std_err, |
| (res).z_stats as z_stats, |
| (res).p_values as p_values |
| FROM ( |
| SELECT |
| {schema_madlib}.__margins_int_coxph_agg( |
| {indep_name} |
| , coef |
| , hessian |
| , {zero_base_subset_indices} |
| , {derivative_str} |
| , {zero_base_categorical_indices} |
| , {set_string} |
| , {unset_string}) |
| as res |
| FROM ( |
| SELECT |
| {independent_varname} as {indep_name} |
| FROM {source_table} as s |
| ) q1, {model_table} as m |
| ) q2 |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, |
| out_table=out_table, |
| independent_varname=independent_varname, |
| indep_name=indep_name, |
| index_array=py_list_to_sql_string( |
| [x_design_parser.basis_terms[i] |
| for i in x_design_parser.subset_basis], |
| array_type="text"), |
| strata_col_str=strata_col_str, |
| strata_cols=strata_cols, |
| model_table=model_table, |
| zero_base_subset_indices=py_list_to_sql_string( |
| [i - 1 for i in x_design_parser.subset_basis], |
| array_type="double precision"), |
| zero_base_categorical_indices=zero_base_categorical_indices, |
| derivative_str=derivative_str, |
| set_string=set_string, |
| unset_string=unset_string, |
| num_basis=len(x_design_parser.subset_basis)) |
| plpy.execute(execute_string) |
| # ------------------------------------------------------------------------- |