| # coding=utf-8 |
| |
| """ |
| @file glm.py_in |
| |
| @brief Generalized Linear Models: Driver functions |
| |
| @namespace glm |
| |
| @brief Generalized Linear Models: Driver functions |
| """ |
| import plpy |
| from utilities.in_mem_group_control import GroupIterationController |
| from utilities.utilities import unique_string |
| from utilities.validate_args import explicit_bool_to_text |
| from utilities.utilities import _string_to_array |
| from utilities.utilities import _string_to_array_with_quotes |
| from utilities.utilities import extract_keyvalue_params |
| from utilities.validate_args import input_tbl_valid |
| from utilities.validate_args import output_tbl_valid |
| from utilities.validate_args import cols_in_tbl_valid |
| from utilities.utilities import add_postfix |
| # ======================================================================== |
| |
| |
| def __compute_glm(arg_dict): |
| """ |
| Compute Generalized Linear Model coefficients |
| |
| This method serves as an interface to different optimization algorithms. |
| By default, iteratively reweighted least squares is used. |
| |
| @return Number of iterations that has been run |
| """ |
| |
| iterationCtrl = GroupIterationController(arg_dict) |
| with iterationCtrl as it: |
| it.iteration = 0 |
| while True: |
| it.update( |
| """ |
| {schema_madlib}.__glm_{family}_{link}_agg( |
| ({col_dep_var})::double precision, |
| ({col_ind_var})::double precision[], |
| {rel_state}.{col_grp_state}) |
| """) |
| if it.test( |
| """ |
| {iteration} >= {max_iter} |
| OR {schema_madlib}.__glm_loglik_diff( |
| _state_previous, _state_current) < {tolerance} |
| """): |
| it.final() |
| break |
| |
| return iterationCtrl.iteration |
| # ======================================================================== |
| |
| |
| def glm(schema_madlib, source_table, model_table, dependent_varname, |
| independent_varname, family_params=None, grouping_col=None, |
| optim_params=None, verbose=False, **kwargs): |
| """ |
| Train Genralized Linear Model |
| |
| @param schema_madlib Name of the MADlib schema, properly escaped/quoted |
| @param source_table Name of relation containing the training data |
| @param model_table Name of relation where model will be outputted |
| @param dependent_varname Name of dependent column in training data |
| @param independent_varname Name of independent column in training data (of type |
| DOUBLE PRECISION[]) |
| @param family_params Distribution of dependent variable |
| @param grouping_col String of comma delimited group-by columns |
| @param optim_params Parameters for optimizer |
| @param kwargs We allow the caller to specify additional arguments (all of |
| which will be ignored though). The purpose of this is to allow the |
| caller to unpack a dictionary whose element set is a superset of |
| the required arguments by this function. |
| |
| @return A composite value which is __glm_result defined in glm.sql_in |
| """ |
| __glm_validate_args(schema_madlib, source_table, model_table, dependent_varname, |
| independent_varname, grouping_col) |
| |
| family_params_dict = __extract_family_params(schema_madlib, family_params) |
| optim_params_dict = __extract_optim_params(schema_madlib, optim_params) |
| |
| return __glm_compute( |
| schema_madlib, source_table, model_table, dependent_varname, |
| independent_varname, grouping_col, family_params_dict, |
| optim_params_dict, verbose=verbose, **kwargs) |
| # ======================================================================== |
| |
| |
| def __glm_validate_args(schema_madlib, tbl_source, tbl_output, col_dep_var, |
| col_ind_var, grouping_col): |
| """ |
| Validate the arguments |
| """ |
| input_tbl_valid(tbl_source, 'GLM') |
| |
| output_tbl_valid(tbl_output, 'GLM') |
| |
| if col_dep_var is None or col_dep_var.strip() == '': |
| plpy.error("GLM error: Invalid dependent column name!") |
| |
| if col_ind_var is None or col_ind_var.strip() == '': |
| plpy.error("GLM error: Invalid independent column name!") |
| |
| if grouping_col: |
| cols_in_tbl_valid(tbl_source, _string_to_array_with_quotes(grouping_col), 'GLM') |
| intersect = frozenset(_string_to_array(grouping_col)).intersection( |
| frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats', |
| 'p_values', 'odds_ratios', 'condition_no', |
| 'num_processed', 'num_missing_rows_skipped', |
| 'variance_covariance', 'dispersion', 't_stats'))) |
| if len(intersect) > 0: |
| plpy.error("GLM error: Conflicting grouping column name.\n" |
| "Predefined name(s) {0} are not allow!".format( |
| ', '.join(intersect))) |
| |
| return None |
| # ======================================================================== |
| |
| |
| def __extract_family_params(schema_madlib, family_params): |
| family_params_types = dict(family=str, link=str) |
| family_params_dict = extract_keyvalue_params(family_params, |
| family_params_types) |
| # we use the first element as the default link function for the family |
| family_link = dict( |
| poisson=["log", "identity", "sqrt"], |
| gaussian=["identity", "log", "inverse"], |
| gamma=["inverse", "log", "identity"], |
| inverse_gaussian=["sqr_inverse", "identity", "log", "inverse"], |
| binomial=["logit", "probit"]) |
| for k, v in family_params_dict.iteritems(): |
| if k == "family": |
| if v not in family_link.keys(): |
| plpy.error("GLM error: {param_value} is not a valid " |
| "family!".format(param_value=v)) |
| |
| if "family" not in family_params_dict.keys(): |
| plpy.error("GLM error: Required parameter family is missing!") |
| if "link" in family_params_dict.keys(): |
| if family_params_dict["link"] not in family_link[family_params_dict["family"]]: |
| plpy.error("GLM error: Invalid link function {link_func} for " |
| "family {family}!".format(link_func=family_params_dict["link"], |
| family=family_params_dict["family"])) |
| else: |
| # default link function |
| family_params_dict["link"] = family_link[family_params_dict["family"]][0] |
| |
| return family_params_dict |
| # ======================================================================== |
| |
| |
| def __extract_optim_params(schema_madlib, optim_params, module='GLM'): |
| default_dict = dict(max_iter=100, optimizer='irls', tolerance=1e-6) |
| optim_params_types = dict(max_iter=int, optimizer=str, tolerance=float) |
| optim_params_dict = extract_keyvalue_params(optim_params, |
| optim_params_types, |
| default_dict) |
| |
| if optim_params_dict['max_iter'] <= 0: |
| plpy.error("{0} error: max_iter must be positive!".format(module)) |
| if optim_params_dict['optimizer'] != 'irls': |
| plpy.error("{0} error: optimizer must be irls!".format(module)) |
| if optim_params_dict['tolerance'] <= 0: |
| plpy.error("{0} error: tolerane must be positive!".format(module)) |
| |
| return optim_params_dict |
| # ======================================================================== |
| |
| |
| def __glm_compute(schema_madlib, tbl_source, tbl_output, col_dep_var, col_ind_var, |
| grouping_col, family_params, optim_params, verbose=False, **kwargs): |
| """ |
| Create an output table (drop if exists) that contains the Generalized Linear Model |
| """ |
| old_msg_level = plpy.execute(""" |
| SELECT setting FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| if verbose: |
| plpy.execute("SET client_min_messages TO info") |
| else: |
| plpy.execute("SET client_min_messages TO warning") |
| |
| args = {'schema_madlib': schema_madlib, |
| 'rel_source': tbl_source, |
| 'tbl_output': tbl_output, |
| 'col_dep_var': col_dep_var, |
| 'col_ind_var': col_ind_var, |
| 'rel_state': unique_string(), |
| 'col_grp_iteration': unique_string(), |
| 'col_grp_state': unique_string(), |
| 'state_type': schema_madlib + ".bytea8" |
| } |
| args.update(optim_params) |
| args.update(family_params) |
| |
| # return an array of dict |
| # each dict has two elements: iteration number, and grouping value array |
| if grouping_col: |
| grouping_list = explicit_bool_to_text( |
| tbl_source, _string_to_array_with_quotes(grouping_col), schema_madlib) |
| for i in range(len(grouping_list)): |
| grouping_list[i] += "::text" |
| grouping_str = ','.join(grouping_list) |
| else: |
| grouping_col = None |
| grouping_str = "NULL" |
| args['grouping_col'] = grouping_col |
| args['grouping_str'] = grouping_str |
| |
| # for binomial distribution, the dependent variable is of type boolean. |
| # it's cast to integer here so that it can later be type cast to |
| # double precision before computation begins. |
| if family_params['family'] == 'binomial': |
| args['col_dep_var'] = "(" + col_dep_var + ")::integer" |
| |
| # REAL COMPUTATION |
| iteration_run = __compute_glm(args) |
| |
| if iteration_run >= optim_params['max_iter']: |
| plpy.warning("GLM warning: the computation did not converge in " + |
| str(optim_params['max_iter']) + " iterations!") |
| |
| # output table |
| grouping_str1 = "" if grouping_col is None else grouping_col + "," |
| grouping_str2 = "1 = 1" if grouping_col is None else grouping_col |
| using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")" |
| join_str = "," if grouping_str1 == "" else "join " |
| |
| if family_params['family'] in ['poisson', 'binomial']: |
| res_str = """ |
| (result).z_stats AS z_stats, |
| (result).p_values AS p_values, |
| (result).dispersion AS dispersion |
| """ |
| glm_result = "__glm_result_z_stats" |
| else: |
| res_str = """ |
| (result).z_stats AS t_stats, |
| (result).p_values AS p_values, |
| (result).dispersion AS dispersion |
| """ |
| glm_result = "__glm_result_t_stats" |
| |
| plpy.execute( |
| """ |
| CREATE TABLE {tbl_output} AS |
| SELECT |
| {grouping_str1} |
| (result).coef AS coef, |
| (result).loglik AS log_likelihood, |
| (result).std_err AS std_err, |
| {res_str}, |
| (CASE WHEN result IS NULL THEN 0 |
| ELSE (result).num_rows_processed |
| END)::bigint AS num_rows_processed, |
| (CASE WHEN result IS NULL THEN num_rows |
| ELSE num_rows - (result).num_rows_processed |
| END)::bigint AS num_rows_skipped, |
| {col_grp_iteration}::integer AS num_iterations |
| FROM ( |
| SELECT |
| {col_grp_iteration}, {grouping_str1} result, num_rows |
| FROM ( |
| ( SELECT |
| {grouping_str1} |
| {schema_madlib}.{glm_result}({col_grp_state}) AS result, |
| {col_grp_iteration} |
| FROM |
| {rel_state} |
| ) t |
| JOIN |
| ( SELECT |
| {grouping_str1} |
| max({col_grp_iteration}) AS {col_grp_iteration} |
| FROM {rel_state} |
| GROUP BY {grouping_str2} |
| ) s |
| USING ({grouping_str1} {col_grp_iteration}) |
| ) q1 |
| {join_str} |
| ( SELECT |
| {grouping_str1} |
| count(*) AS num_rows |
| FROM {rel_source} |
| GROUP BY {grouping_str2} |
| ) q2 |
| {using_str} |
| ) q3 |
| """.format(grouping_str1=grouping_str1, |
| grouping_str2=grouping_str2, |
| iteration_run=iteration_run, |
| using_str=using_str, |
| join_str=join_str, |
| res_str=res_str, |
| glm_result=glm_result, |
| **args)) |
| |
| # summary table |
| failed_groups = plpy.execute(""" |
| SELECT count(*) AS num_failed_groups |
| FROM {tbl_output} |
| WHERE coef IS NULL |
| """.format(**args))[0] |
| all_groups = plpy.execute(""" |
| SELECT count(*) AS num_all_groups |
| FROM {tbl_output} |
| """.format(**args))[0] |
| total_rows = plpy.execute(""" |
| SELECT |
| sum(num_rows_processed) AS total_rows_processed, |
| sum(num_rows_skipped) AS total_rows_skipped |
| FROM {tbl_output} |
| """.format(tbl_output=tbl_output))[0] |
| |
| args.update(failed_groups) |
| args.update(all_groups) |
| args.update(total_rows) |
| |
| tbl_output_summary = add_postfix(tbl_output, "_summary") |
| plpy.execute(""" |
| CREATE TABLE {tbl_output_summary} AS |
| SELECT |
| 'glm'::varchar AS method, |
| '{rel_source}'::varchar AS source_table, |
| '{tbl_output}'::varchar AS out_table, |
| $madlib_super_quote${dcol}$madlib_super_quote$::varchar |
| AS dependent_varname, |
| $madlib_super_quote${col_ind_var}$madlib_super_quote$::varchar |
| AS independent_varname, |
| 'family={family}, ' || |
| 'link={link}'::varchar AS family_params, |
| {g_str}::text AS grouping_col, |
| 'optimizer={optimizer}, ' || |
| 'max_iter={max_iter}, ' || |
| 'tolerance={tolerance}'::varchar AS optimizer_params, |
| {num_all_groups}::integer AS num_all_groups, |
| {num_failed_groups}::integer AS num_failed_groups, |
| {total_rows_processed}::bigint AS total_rows_processed, |
| {total_rows_skipped}::bigint AS total_rows_skipped |
| """.format(g_str="'" + grouping_col + "'" if grouping_col else "NULL", |
| tbl_output_summary=tbl_output_summary, |
| dcol=col_dep_var, |
| **args)) |
| |
| # clean up |
| plpy.execute("""DROP TABLE IF EXISTS pg_temp.{rel_state} """.format(**args)) |
| plpy.execute("SET client_min_messages TO " + old_msg_level) |
| return None |
| |
| |
| # ======================================================================== |
| |
| def glm_help_msg(schema_madlib, message, **kwargs): |
| """ Help message for generalized linear regression model |
| |
| @param message A string, the help message indicator |
| |
| Returns: |
| A string, contains the help message |
| """ |
| if not message: |
| |
| help_string = """ |
| ------------------------------------------------------------------ |
| SUMMARY |
| ------------------------------------------------------------------ |
| Generalized Linear Model: |
| |
| Function to fit a generalized linear model, relating responses to linear combinations |
| of predictor variables. |
| |
| For details on function usage: |
| """ |
| elif message in ['usage', 'help', '?']: |
| |
| help_string = """ |
| ------------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------------ |
| SELECT {schema_madlib}.glm( |
| source_table, -- name of input table |
| model_table, -- name of model table |
| dependent_varname, -- name of dependent variable |
| independent_varname, -- names of independent variables |
| family_params, -- parameters for family distribution and link function |
| |
| usage: |
| 'family=<family_name>,link=<link_function_name>' |
| |
| supported values include: |
| 'family=poisson,link=identity|log|sqrt' (default link: log) |
| 'family=binomial,link=logit|probit' (default link: logit) |
| 'family=gaussian,link=identity|log|inverse' (default link: identity) |
| 'family=inverse_gaussian,link=identity|log|inverse|sqr_inverse' (default link: sqr_inverse i.e. 1/mu^2) |
| 'family=gamma,link=identity|log|inverse' (default link: inverse) |
| |
| grouping_col, -- optional, default NULL, names of columns to group-by |
| optimizer_params, -- optional, parameters for optimizer |
| |
| usage: |
| 'max_iter=<max_num_iterations>,optimizer=<optimizer_name>,tolerance=<tolerance_value>' |
| |
| default values include: |
| max_iter=100 |
| optimizer='irls' |
| tolerance=1e-6 |
| |
| verbose -- optional, default FALSE, whether to print debug info |
| ); |
| |
| ------------------------------------------------------------------ |
| OUTPUT |
| ------------------------------------------------------------------ |
| The output table ('out_table' above) has the following columns: |
| <...> -- grouping columns |
| 'coef' double precision[], -- vector of coefficients |
| 'log_likelihood' double precision, -- log likelihood |
| 'std_err' double precision[], -- vector of standard errors |
| 'z_stats'/'t_stats' double precision[], -- vector of z-statistics if family=Poisson or Binomial; vector of t-statistics otherwise |
| 'p_values' double precision[], -- vector of p-values |
| 'dispersion' double precision[], -- dispersion parameter (if z-stats is used, dispersion is set to be constant 1) |
| 'num_rows_processed' bigint, -- numbers of rows processed |
| 'num_rows_skipped' bigint, -- numbers of rows skipped |
| 'num_iterations' integer -- number of iterations run |
| |
| A summary table named <out_table>_summary is also created at the same time, which has: |
| method varchar, -- modeling method name: 'glm' |
| source_table varchar, -- the data source table name |
| model_table varchar, -- the output table name |
| dependent_varname varchar, -- the dependent variable |
| independent_varname varchar, -- the independent variable |
| family_params varchar, -- family distribution and link function |
| grouping_col varchar -- grouping columns used in the regression |
| optimizer_params varchar, -- 'optimizer=...,max_iter=...,tolerance=...' |
| num_all_groups integer, -- how many groups |
| num_failed_groups integer, -- how many groups' fitting processes failed |
| total_rows_processed bigint, -- total numbers of rows processed |
| total_rows_skipped bigint, -- total numbers of rows skipped |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.glm('help')" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| |
| |
| # ======================================================================== |
| |
| def glm_predict_help_msg(schema_madlib, message, **kwargs): |
| """ Help message for glm predict function |
| |
| @param message A string, the help message indicator |
| |
| Returns: |
| A string, contains the help message |
| """ |
| if not message: |
| |
| help_string = """ |
| ---------------------------------------------------------------- |
| SUMMARY |
| ---------------------------------------------------------------- |
| Prediction function for generalized linear regression: |
| |
| Estimate the conditional mean for the new predictors. The length of input |
| coefficients should match the number of variables in the new predictors. |
| |
| For details on function usage: |
| SELECT {schema_madlib}.glm_predict('usage') |
| |
| For prediction functions related to specific distributions: |
| SELECT {schema_madlib}.glm_predict_poisson('help') |
| SELECT {schema_madlib}.glm_predict_binomial('help') |
| """ |
| elif message in ['usage', 'help', '?']: |
| |
| help_string = """ |
| ------------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------------ |
| SELECT {schema_madlib}.glm_predict( |
| coef, -- array of coefficients derived from glm() function |
| col_ind_var, -- array of independent variables for new predictors |
| link -- string indicating the link function specifid in glm() |
| ); |
| |
| ------------------------------------------------------------------ |
| OUTPUT |
| ------------------------------------------------------------------ |
| The output is a table with one column which gives the estimated conditional |
| means for the new predictors. |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.glm_predict('help')" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| |
| |
| # ============================================================================ |
| # Help messages for specialized prediction functions |
| # ============================================================================ |
| def glm_predict_poisson_help_msg(schema_madlib, message, **kwargs): |
| """ Help message for glm predict function |
| |
| @param message A string, the help message indicator |
| |
| Returns: |
| A string, contains the help message |
| """ |
| if message in ['usage', 'help', '?', '']: |
| |
| help_string = """ |
| ------------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------------ |
| SELECT {schema_madlib}.glm_predict_poisson( |
| coef, -- array of coefficients derived from glm() function |
| col_ind_var, -- array of independent variables for new predictors |
| link -- string indicating the link function specifid in glm() |
| ); |
| |
| ------------------------------------------------------------------ |
| OUTPUT |
| ------------------------------------------------------------------ |
| The output is a table with one column which gives the estimated conditional |
| mean for the new predictors, rounded to the nearest integral value. |
| |
| For more details on glm predict functions: |
| SELECT {schema_madlib}.glm_predict('usage') |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.glm_predict_poisson('help')" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| |
| |
| def glm_predict_binomial_help_msg(schema_madlib, message, **kwargs): |
| """ Help message for glm predict function |
| |
| @param message A string, the help message indicator |
| |
| Returns: |
| A string, contains the help message |
| """ |
| if message in ['usage', 'help', '?', '']: |
| |
| help_string = """ |
| ------------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------------ |
| SELECT {schema_madlib}.glm_predict_binomial( |
| coef, -- array of coefficients derived from glm() function |
| col_ind_var, -- array of independent variables for new predictors |
| link -- string indicating the link function specifid in glm() |
| ); |
| |
| ------------------------------------------------------------------ |
| OUTPUT |
| ------------------------------------------------------------------ |
| The output is a table with one column which gives the estimated output category |
| of the dependent variable as a boolean value. |
| |
| For more details on glm predict functions: |
| SELECT {schema_madlib}.glm_predict('usage') |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.glm_predict_binomial('help')" |
| |
| return help_string.format(schema_madlib=schema_madlib) |