src/ports/postgres/modules/glm/glm.py_in - madlib - Git at Google

 # coding=utf-8

 """
 @file glm.py_in

 @brief Generalized Linear Models: Driver functions

 @namespace glm

 @brief Generalized Linear Models: Driver functions
 """
 import plpy
 from utilities.in_mem_group_control import GroupIterationController
 from utilities.utilities import unique_string
 from utilities.validate_args import explicit_bool_to_text
 from utilities.utilities import _string_to_array
 from utilities.utilities import _string_to_array_with_quotes
 from utilities.utilities import extract_keyvalue_params
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import output_tbl_valid
 from utilities.validate_args import cols_in_tbl_valid
 from utilities.utilities import add_postfix
 # ========================================================================


 def __compute_glm(arg_dict):
     """
     Compute Generalized Linear Model coefficients

     This method serves as an interface to different optimization algorithms.
     By default, iteratively reweighted least squares is used.

     @return Number of iterations that has been run
     """

     iterationCtrl = GroupIterationController(arg_dict)
     with iterationCtrl as it:
         it.iteration = 0
         while True:
             it.update(
                 """
                 {schema_madlib}.__glm_{family}_{link}_agg(
                     ({col_dep_var})::double precision,
                     ({col_ind_var})::double precision[],
                     {rel_state}.{col_grp_state})
                 """)
             if it.test(
                     """
                     {iteration} >= {max_iter}
                     OR {schema_madlib}.__glm_loglik_diff(
                         _state_previous, _state_current) < {tolerance}
                     """):
                 it.final()
                 break

     return iterationCtrl.iteration
 # ========================================================================


 def glm(schema_madlib, source_table, model_table, dependent_varname,
         independent_varname, family_params=None, grouping_col=None,
         optim_params=None, verbose=False, **kwargs):
     """
     Train Genralized Linear Model

     @param schema_madlib Name of the MADlib schema, properly escaped/quoted
     @param source_table Name of relation containing the training data
     @param model_table Name of relation where model will be outputted
     @param dependent_varname Name of dependent column in training data
     @param independent_varname Name of independent column in training data (of type
                    DOUBLE PRECISION[])
     @param family_params Distribution of dependent variable
     @param grouping_col String of comma delimited group-by columns
     @param optim_params Parameters for optimizer
     @param kwargs We allow the caller to specify additional arguments (all of
            which will be ignored though). The purpose of this is to allow the
            caller to unpack a dictionary whose element set is a superset of
            the required arguments by this function.

     @return A composite value which is __glm_result defined in glm.sql_in
     """
     __glm_validate_args(schema_madlib, source_table, model_table, dependent_varname,
                         independent_varname, grouping_col)

     family_params_dict = __extract_family_params(schema_madlib, family_params)
     optim_params_dict = __extract_optim_params(schema_madlib, optim_params)

     return __glm_compute(
         schema_madlib, source_table, model_table, dependent_varname,
         independent_varname, grouping_col, family_params_dict,
         optim_params_dict, verbose=verbose, **kwargs)
 # ========================================================================


 def __glm_validate_args(schema_madlib, tbl_source, tbl_output, col_dep_var,
                         col_ind_var, grouping_col):
     """
     Validate the arguments
     """
     input_tbl_valid(tbl_source, 'GLM')

     output_tbl_valid(tbl_output, 'GLM')

     if col_dep_var is None or col_dep_var.strip() == '':
         plpy.error("GLM error: Invalid dependent column name!")

     if col_ind_var is None or col_ind_var.strip() == '':
         plpy.error("GLM error: Invalid independent column name!")

     if grouping_col:
         cols_in_tbl_valid(tbl_source, _string_to_array_with_quotes(grouping_col), 'GLM')
         intersect = frozenset(_string_to_array(grouping_col)).intersection(
             frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
                        'p_values', 'odds_ratios', 'condition_no',
                        'num_processed', 'num_missing_rows_skipped',
                        'variance_covariance', 'dispersion', 't_stats')))
         if len(intersect) > 0:
             plpy.error("GLM error: Conflicting grouping column name.\n"
                        "Predefined name(s) {0} are not allow!".format(
                            ', '.join(intersect)))

     return None
 # ========================================================================


 def __extract_family_params(schema_madlib, family_params):
     family_params_types = dict(family=str, link=str)
     family_params_dict = extract_keyvalue_params(family_params,
                                                  family_params_types)
     # we use the first element as the default link function for the family
     family_link = dict(
         poisson=["log", "identity", "sqrt"],
         gaussian=["identity", "log", "inverse"],
         gamma=["inverse", "log", "identity"],
         inverse_gaussian=["sqr_inverse", "identity", "log", "inverse"],
         binomial=["logit", "probit"])
     for k, v in family_params_dict.iteritems():
         if k == "family":
             if v not in family_link.keys():
                 plpy.error("GLM error: {param_value} is not a valid "
                            "family!".format(param_value=v))

     if "family" not in family_params_dict.keys():
         plpy.error("GLM error: Required parameter family is missing!")
     if "link" in family_params_dict.keys():
         if family_params_dict["link"] not in family_link[family_params_dict["family"]]:
             plpy.error("GLM error: Invalid link function {link_func} for "
                        "family {family}!".format(link_func=family_params_dict["link"],
                                                  family=family_params_dict["family"]))
     else:
         # default link function
         family_params_dict["link"] = family_link[family_params_dict["family"]][0]

     return family_params_dict
 # ========================================================================


 def __extract_optim_params(schema_madlib, optim_params, module='GLM'):
     default_dict = dict(max_iter=100, optimizer='irls', tolerance=1e-6)
     optim_params_types = dict(max_iter=int, optimizer=str, tolerance=float)
     optim_params_dict = extract_keyvalue_params(optim_params,
                                                 optim_params_types,
                                                 default_dict)

     if optim_params_dict['max_iter'] <= 0:
         plpy.error("{0} error: max_iter must be positive!".format(module))
     if optim_params_dict['optimizer'] != 'irls':
         plpy.error("{0} error: optimizer must be irls!".format(module))
     if optim_params_dict['tolerance'] <= 0:
         plpy.error("{0} error: tolerane must be positive!".format(module))

     return optim_params_dict
 # ========================================================================


 def __glm_compute(schema_madlib, tbl_source, tbl_output, col_dep_var, col_ind_var,
                   grouping_col, family_params, optim_params, verbose=False, **kwargs):
     """
     Create an output table (drop if exists) that contains the Generalized Linear Model
     """
     old_msg_level = plpy.execute("""
         SELECT setting FROM pg_settings
         WHERE name='client_min_messages'
         """)[0]['setting']
     if verbose:
         plpy.execute("SET client_min_messages TO info")
     else:
         plpy.execute("SET client_min_messages TO warning")

     args = {'schema_madlib': schema_madlib,
             'rel_source': tbl_source,
             'tbl_output': tbl_output,
             'col_dep_var': col_dep_var,
             'col_ind_var': col_ind_var,
             'rel_state': unique_string(),
             'col_grp_iteration': unique_string(),
             'col_grp_state': unique_string(),
             'state_type': schema_madlib + ".bytea8"
             }
     args.update(optim_params)
     args.update(family_params)

     # return an array of dict
     # each dict has two elements: iteration number, and grouping value array
     if grouping_col:
         grouping_list = explicit_bool_to_text(
             tbl_source, _string_to_array_with_quotes(grouping_col), schema_madlib)
         for i in range(len(grouping_list)):
             grouping_list[i] += "::text"
         grouping_str = ','.join(grouping_list)
     else:
         grouping_col = None
         grouping_str = "NULL"
     args['grouping_col'] = grouping_col
     args['grouping_str'] = grouping_str

     # for binomial distribution, the dependent variable is of type boolean.
     # it's cast to integer here so that it can later be type cast to
     # double precision before computation begins.
     if family_params['family'] == 'binomial':
         args['col_dep_var'] = "(" + col_dep_var + ")::integer"

     # REAL COMPUTATION
     iteration_run = __compute_glm(args)

     if iteration_run >= optim_params['max_iter']:
         plpy.warning("GLM warning: the computation did not converge in " +
                      str(optim_params['max_iter']) + " iterations!")

     # output table
     grouping_str1 = "" if grouping_col is None else grouping_col + ","
     grouping_str2 = "1 = 1" if grouping_col is None else grouping_col
     using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
     join_str = "," if grouping_str1 == "" else "join "

     if family_params['family'] in ['poisson', 'binomial']:
         res_str = """
             (result).z_stats AS z_stats,
             (result).p_values AS p_values,
             (result).dispersion AS dispersion
             """
         glm_result = "__glm_result_z_stats"
     else:
         res_str = """
             (result).z_stats AS t_stats,
             (result).p_values AS p_values,
             (result).dispersion AS dispersion
             """
         glm_result = "__glm_result_t_stats"

     plpy.execute(
         """
         CREATE TABLE {tbl_output} AS
         SELECT
             {grouping_str1}
             (result).coef AS coef,
             (result).loglik AS log_likelihood,
             (result).std_err AS std_err,
             {res_str},
            (CASE WHEN result IS NULL THEN 0
              ELSE (result).num_rows_processed
              END)::bigint AS num_rows_processed,
             (CASE WHEN result IS NULL THEN num_rows
              ELSE num_rows - (result).num_rows_processed
              END)::bigint AS num_rows_skipped,
             {col_grp_iteration}::integer AS num_iterations
         FROM (
             SELECT
                 {col_grp_iteration}, {grouping_str1} result, num_rows
             FROM (
                 ( SELECT
                         {grouping_str1}
                         {schema_madlib}.{glm_result}({col_grp_state}) AS result,
                         {col_grp_iteration}
                   FROM
                         {rel_state}
                 ) t
                 JOIN
                 ( SELECT
                         {grouping_str1}
                         max({col_grp_iteration}) AS {col_grp_iteration}
                   FROM {rel_state}
                   GROUP BY {grouping_str2}
                 ) s
                 USING ({grouping_str1} {col_grp_iteration})
             ) q1
             {join_str}
             ( SELECT
                     {grouping_str1}
                     count(*) AS num_rows
               FROM {rel_source}
               GROUP BY {grouping_str2}
             ) q2
             {using_str}
         ) q3
         """.format(grouping_str1=grouping_str1,
                    grouping_str2=grouping_str2,
                    iteration_run=iteration_run,
                    using_str=using_str,
                    join_str=join_str,
                    res_str=res_str,
                    glm_result=glm_result,
                    **args))

     # summary table
     failed_groups = plpy.execute("""
         SELECT count(*) AS num_failed_groups
         FROM {tbl_output}
         WHERE coef IS NULL
         """.format(**args))[0]
     all_groups = plpy.execute("""
         SELECT count(*) AS num_all_groups
         FROM {tbl_output}
         """.format(**args))[0]
     total_rows = plpy.execute("""
         SELECT
             sum(num_rows_processed) AS total_rows_processed,
             sum(num_rows_skipped) AS total_rows_skipped
         FROM {tbl_output}
         """.format(tbl_output=tbl_output))[0]

     args.update(failed_groups)
     args.update(all_groups)
     args.update(total_rows)

     tbl_output_summary = add_postfix(tbl_output, "_summary")
     plpy.execute("""
         CREATE TABLE {tbl_output_summary} AS
         SELECT
             'glm'::varchar                      AS method,
             '{rel_source}'::varchar             AS source_table,
             '{tbl_output}'::varchar             AS out_table,
             $madlib_super_quote${dcol}$madlib_super_quote$::varchar
                                                 AS dependent_varname,
             $madlib_super_quote${col_ind_var}$madlib_super_quote$::varchar
                                                 AS independent_varname,
             'family={family}, ' ||
             'link={link}'::varchar              AS family_params,
             {g_str}::text                       AS grouping_col,
             'optimizer={optimizer}, ' ||
             'max_iter={max_iter}, '   ||
             'tolerance={tolerance}'::varchar    AS optimizer_params,
             {num_all_groups}::integer           AS num_all_groups,
             {num_failed_groups}::integer        AS num_failed_groups,
             {total_rows_processed}::bigint      AS total_rows_processed,
             {total_rows_skipped}::bigint        AS total_rows_skipped
         """.format(g_str="'" + grouping_col + "'" if grouping_col else "NULL",
                    tbl_output_summary=tbl_output_summary,
                    dcol=col_dep_var,
                    **args))

     # clean up
     plpy.execute("""DROP TABLE IF EXISTS pg_temp.{rel_state} """.format(**args))
     plpy.execute("SET client_min_messages TO " + old_msg_level)
     return None


 # ========================================================================

 def glm_help_msg(schema_madlib, message, **kwargs):
     """ Help message for generalized linear regression model

     @param message A string, the help message indicator

     Returns:
       A string, contains the help message
     """
     if not message:

         help_string = """
 ------------------------------------------------------------------
                         SUMMARY
 ------------------------------------------------------------------
 Generalized Linear Model:

 Function to fit a generalized linear model, relating responses to linear combinations
 of predictor variables.

 For details on function usage:
         """
     elif message in ['usage', 'help', '?']:

         help_string = """
 ------------------------------------------------------------------
                         USAGE
 ------------------------------------------------------------------
 SELECT {schema_madlib}.glm(
     source_table,        -- name of input table
     model_table,         -- name of model table
     dependent_varname,   -- name of dependent variable
     independent_varname, -- names of independent variables
     family_params,       -- parameters for family distribution and link function

                             usage:
                             'family=<family_name>,link=<link_function_name>'

                             supported values include:
                             'family=poisson,link=identity|log|sqrt' (default link: log)
                             'family=binomial,link=logit|probit' (default link: logit)
                             'family=gaussian,link=identity|log|inverse' (default link: identity)
                             'family=inverse_gaussian,link=identity|log|inverse|sqr_inverse' (default link: sqr_inverse i.e. 1/mu^2)
                             'family=gamma,link=identity|log|inverse' (default link: inverse)

     grouping_col,        -- optional, default NULL, names of columns to group-by
     optimizer_params,    -- optional, parameters for optimizer

                             usage:
                             'max_iter=<max_num_iterations>,optimizer=<optimizer_name>,tolerance=<tolerance_value>'

                             default values include:
                             max_iter=100
                             optimizer='irls'
                             tolerance=1e-6

     verbose              -- optional, default FALSE, whether to print debug info
 );

 ------------------------------------------------------------------
                         OUTPUT
 ------------------------------------------------------------------
 The output table ('out_table' above) has the following columns:
     <...>                                    -- grouping columns
     'coef'               double precision[], -- vector of coefficients
     'log_likelihood'     double precision,   -- log likelihood
     'std_err'            double precision[], -- vector of standard errors
     'z_stats'/'t_stats'  double precision[], -- vector of z-statistics if family=Poisson or Binomial; vector of t-statistics otherwise
     'p_values'           double precision[], -- vector of p-values
     'dispersion'         double precision[], -- dispersion parameter (if z-stats is used, dispersion is set to be constant 1)
     'num_rows_processed' bigint,             -- numbers of rows processed
     'num_rows_skipped'   bigint,             -- numbers of rows skipped
     'num_iterations'     integer             -- number of iterations run

 A summary table named <out_table>_summary is also created at the same time, which has:
     method               varchar, -- modeling method name: 'glm'
     source_table         varchar, -- the data source table name
     model_table          varchar, -- the output table name
     dependent_varname    varchar, -- the dependent variable
     independent_varname  varchar, -- the independent variable
     family_params        varchar, -- family distribution and link function
     grouping_col         varchar  -- grouping columns used in the regression
     optimizer_params     varchar, -- 'optimizer=...,max_iter=...,tolerance=...'
     num_all_groups       integer, -- how many groups
     num_failed_groups    integer, -- how many groups' fitting processes failed
     total_rows_processed bigint,  -- total numbers of rows processed
     total_rows_skipped   bigint,  -- total numbers of rows skipped
         """
     else:
         help_string = "No such option. Use {schema_madlib}.glm('help')"

     return help_string.format(schema_madlib=schema_madlib)


 # ========================================================================

 def glm_predict_help_msg(schema_madlib, message, **kwargs):
     """ Help message for glm predict function

     @param message A string, the help message indicator

     Returns:
       A string, contains the help message
     """
     if not message:

         help_string = """
 ----------------------------------------------------------------
                         SUMMARY
 ----------------------------------------------------------------
 Prediction function for generalized linear regression:

 Estimate the conditional mean for the new predictors. The length of input
 coefficients should match the number of variables in the new predictors.

 For details on function usage:
     SELECT {schema_madlib}.glm_predict('usage')

 For prediction functions related to specific distributions:
     SELECT {schema_madlib}.glm_predict_poisson('help')
     SELECT {schema_madlib}.glm_predict_binomial('help')
         """
     elif message in ['usage', 'help', '?']:

         help_string = """
 ------------------------------------------------------------------
                         USAGE
 ------------------------------------------------------------------
 SELECT {schema_madlib}.glm_predict(
     coef,            -- array of coefficients derived from glm() function
     col_ind_var,     -- array of independent variables for new predictors
     link             -- string indicating the link function specifid in glm()
 );

 ------------------------------------------------------------------
                         OUTPUT
 ------------------------------------------------------------------
 The output is a table with one column which gives the estimated conditional
 means for the new predictors.
         """
     else:
         help_string = "No such option. Use {schema_madlib}.glm_predict('help')"

     return help_string.format(schema_madlib=schema_madlib)


 # ============================================================================
 # Help messages for specialized prediction functions
 # ============================================================================
 def glm_predict_poisson_help_msg(schema_madlib, message, **kwargs):
     """ Help message for glm predict function

     @param message A string, the help message indicator

     Returns:
       A string, contains the help message
     """
     if message in ['usage', 'help', '?', '']:

         help_string = """
 ------------------------------------------------------------------
                         USAGE
 ------------------------------------------------------------------
 SELECT {schema_madlib}.glm_predict_poisson(
     coef,            -- array of coefficients derived from glm() function
     col_ind_var,     -- array of independent variables for new predictors
     link             -- string indicating the link function specifid in glm()
 );

 ------------------------------------------------------------------
                         OUTPUT
 ------------------------------------------------------------------
 The output is a table with one column which gives the estimated conditional
 mean for the new predictors, rounded to the nearest integral value.

 For more details on glm predict functions:
     SELECT {schema_madlib}.glm_predict('usage')
     """
     else:
         help_string = "No such option. Use {schema_madlib}.glm_predict_poisson('help')"

     return help_string.format(schema_madlib=schema_madlib)


 def glm_predict_binomial_help_msg(schema_madlib, message, **kwargs):
     """ Help message for glm predict function

     @param message A string, the help message indicator

     Returns:
       A string, contains the help message
     """
     if message in ['usage', 'help', '?', '']:

         help_string = """
 ------------------------------------------------------------------
                         USAGE
 ------------------------------------------------------------------
 SELECT {schema_madlib}.glm_predict_binomial(
     coef,            -- array of coefficients derived from glm() function
     col_ind_var,     -- array of independent variables for new predictors
     link             -- string indicating the link function specifid in glm()
 );

 ------------------------------------------------------------------
                         OUTPUT
 ------------------------------------------------------------------
 The output is a table with one column which gives the estimated output category
 of the dependent variable as a boolean value.

 For more details on glm predict functions:
     SELECT {schema_madlib}.glm_predict('usage')
     """
     else:
         help_string = "No such option. Use {schema_madlib}.glm_predict_binomial('help')"

     return help_string.format(schema_madlib=schema_madlib)
	# coding=utf-8

	"""
	@file glm.py_in

	@brief Generalized Linear Models: Driver functions

	@namespace glm

	@brief Generalized Linear Models: Driver functions
	"""
	import plpy
	from utilities.in_mem_group_control import GroupIterationController
	from utilities.utilities import unique_string
	from utilities.validate_args import explicit_bool_to_text
	from utilities.utilities import _string_to_array
	from utilities.utilities import _string_to_array_with_quotes
	from utilities.utilities import extract_keyvalue_params
	from utilities.validate_args import input_tbl_valid
	from utilities.validate_args import output_tbl_valid
	from utilities.validate_args import cols_in_tbl_valid
	from utilities.utilities import add_postfix
	# ========================================================================


	def __compute_glm(arg_dict):
	"""
	Compute Generalized Linear Model coefficients

	This method serves as an interface to different optimization algorithms.
	By default, iteratively reweighted least squares is used.

	@return Number of iterations that has been run
	"""

	iterationCtrl = GroupIterationController(arg_dict)
	with iterationCtrl as it:
	it.iteration = 0
	while True:
	it.update(
	"""
	{schema_madlib}.__glm_{family}_{link}_agg(
	({col_dep_var})::double precision,
	({col_ind_var})::double precision[],
	{rel_state}.{col_grp_state})
	""")
	if it.test(
	"""
	{iteration} >= {max_iter}
	OR {schema_madlib}.__glm_loglik_diff(
	_state_previous, _state_current) < {tolerance}
	"""):
	it.final()
	break

	return iterationCtrl.iteration
	# ========================================================================


	def glm(schema_madlib, source_table, model_table, dependent_varname,
	independent_varname, family_params=None, grouping_col=None,
	optim_params=None, verbose=False, **kwargs):
	"""
	Train Genralized Linear Model

	@param schema_madlib Name of the MADlib schema, properly escaped/quoted
	@param source_table Name of relation containing the training data
	@param model_table Name of relation where model will be outputted
	@param dependent_varname Name of dependent column in training data
	@param independent_varname Name of independent column in training data (of type
	DOUBLE PRECISION[])
	@param family_params Distribution of dependent variable
	@param grouping_col String of comma delimited group-by columns
	@param optim_params Parameters for optimizer
	@param kwargs We allow the caller to specify additional arguments (all of
	which will be ignored though). The purpose of this is to allow the
	caller to unpack a dictionary whose element set is a superset of
	the required arguments by this function.

	@return A composite value which is __glm_result defined in glm.sql_in
	"""
	__glm_validate_args(schema_madlib, source_table, model_table, dependent_varname,
	independent_varname, grouping_col)

	family_params_dict = __extract_family_params(schema_madlib, family_params)
	optim_params_dict = __extract_optim_params(schema_madlib, optim_params)

	return __glm_compute(
	schema_madlib, source_table, model_table, dependent_varname,
	independent_varname, grouping_col, family_params_dict,
	optim_params_dict, verbose=verbose, **kwargs)
	# ========================================================================


	def __glm_validate_args(schema_madlib, tbl_source, tbl_output, col_dep_var,
	col_ind_var, grouping_col):
	"""
	Validate the arguments
	"""
	input_tbl_valid(tbl_source, 'GLM')

	output_tbl_valid(tbl_output, 'GLM')

	if col_dep_var is None or col_dep_var.strip() == '':
	plpy.error("GLM error: Invalid dependent column name!")

	if col_ind_var is None or col_ind_var.strip() == '':
	plpy.error("GLM error: Invalid independent column name!")

	if grouping_col:
	cols_in_tbl_valid(tbl_source, _string_to_array_with_quotes(grouping_col), 'GLM')
	intersect = frozenset(_string_to_array(grouping_col)).intersection(
	frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
	'p_values', 'odds_ratios', 'condition_no',
	'num_processed', 'num_missing_rows_skipped',
	'variance_covariance', 'dispersion', 't_stats')))
	if len(intersect) > 0:
	plpy.error("GLM error: Conflicting grouping column name.\n"
	"Predefined name(s) {0} are not allow!".format(
	', '.join(intersect)))

	return None
	# ========================================================================


	def __extract_family_params(schema_madlib, family_params):
	family_params_types = dict(family=str, link=str)
	family_params_dict = extract_keyvalue_params(family_params,
	family_params_types)
	# we use the first element as the default link function for the family
	family_link = dict(
	poisson=["log", "identity", "sqrt"],
	gaussian=["identity", "log", "inverse"],
	gamma=["inverse", "log", "identity"],
	inverse_gaussian=["sqr_inverse", "identity", "log", "inverse"],
	binomial=["logit", "probit"])
	for k, v in family_params_dict.iteritems():
	if k == "family":
	if v not in family_link.keys():
	plpy.error("GLM error: {param_value} is not a valid "
	"family!".format(param_value=v))

	if "family" not in family_params_dict.keys():
	plpy.error("GLM error: Required parameter family is missing!")
	if "link" in family_params_dict.keys():
	if family_params_dict["link"] not in family_link[family_params_dict["family"]]:
	plpy.error("GLM error: Invalid link function {link_func} for "
	"family {family}!".format(link_func=family_params_dict["link"],
	family=family_params_dict["family"]))
	else:
	# default link function
	family_params_dict["link"] = family_link[family_params_dict["family"]][0]

	return family_params_dict
	# ========================================================================


	def __extract_optim_params(schema_madlib, optim_params, module='GLM'):
	default_dict = dict(max_iter=100, optimizer='irls', tolerance=1e-6)
	optim_params_types = dict(max_iter=int, optimizer=str, tolerance=float)
	optim_params_dict = extract_keyvalue_params(optim_params,
	optim_params_types,
	default_dict)

	if optim_params_dict['max_iter'] <= 0:
	plpy.error("{0} error: max_iter must be positive!".format(module))
	if optim_params_dict['optimizer'] != 'irls':
	plpy.error("{0} error: optimizer must be irls!".format(module))
	if optim_params_dict['tolerance'] <= 0:
	plpy.error("{0} error: tolerane must be positive!".format(module))

	return optim_params_dict
	# ========================================================================


	def __glm_compute(schema_madlib, tbl_source, tbl_output, col_dep_var, col_ind_var,
	grouping_col, family_params, optim_params, verbose=False, **kwargs):
	"""
	Create an output table (drop if exists) that contains the Generalized Linear Model
	"""
	old_msg_level = plpy.execute("""
	SELECT setting FROM pg_settings
	WHERE name='client_min_messages'
	""")[0]['setting']
	if verbose:
	plpy.execute("SET client_min_messages TO info")
	else:
	plpy.execute("SET client_min_messages TO warning")

	args = {'schema_madlib': schema_madlib,
	'rel_source': tbl_source,
	'tbl_output': tbl_output,
	'col_dep_var': col_dep_var,
	'col_ind_var': col_ind_var,
	'rel_state': unique_string(),
	'col_grp_iteration': unique_string(),
	'col_grp_state': unique_string(),
	'state_type': schema_madlib + ".bytea8"
	}
	args.update(optim_params)
	args.update(family_params)

	# return an array of dict
	# each dict has two elements: iteration number, and grouping value array
	if grouping_col:
	grouping_list = explicit_bool_to_text(
	tbl_source, _string_to_array_with_quotes(grouping_col), schema_madlib)
	for i in range(len(grouping_list)):
	grouping_list[i] += "::text"
	grouping_str = ','.join(grouping_list)
	else:
	grouping_col = None
	grouping_str = "NULL"
	args['grouping_col'] = grouping_col
	args['grouping_str'] = grouping_str

	# for binomial distribution, the dependent variable is of type boolean.
	# it's cast to integer here so that it can later be type cast to
	# double precision before computation begins.
	if family_params['family'] == 'binomial':
	args['col_dep_var'] = "(" + col_dep_var + ")::integer"

	# REAL COMPUTATION
	iteration_run = __compute_glm(args)

	if iteration_run >= optim_params['max_iter']:
	plpy.warning("GLM warning: the computation did not converge in " +
	str(optim_params['max_iter']) + " iterations!")

	# output table
	grouping_str1 = "" if grouping_col is None else grouping_col + ","
	grouping_str2 = "1 = 1" if grouping_col is None else grouping_col
	using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
	join_str = "," if grouping_str1 == "" else "join "

	if family_params['family'] in ['poisson', 'binomial']:
	res_str = """
	(result).z_stats AS z_stats,
	(result).p_values AS p_values,
	(result).dispersion AS dispersion
	"""
	glm_result = "__glm_result_z_stats"
	else:
	res_str = """
	(result).z_stats AS t_stats,
	(result).p_values AS p_values,
	(result).dispersion AS dispersion
	"""
	glm_result = "__glm_result_t_stats"

	plpy.execute(
	"""
	CREATE TABLE {tbl_output} AS
	SELECT
	{grouping_str1}
	(result).coef AS coef,
	(result).loglik AS log_likelihood,
	(result).std_err AS std_err,
	{res_str},
	(CASE WHEN result IS NULL THEN 0
	ELSE (result).num_rows_processed
	END)::bigint AS num_rows_processed,
	(CASE WHEN result IS NULL THEN num_rows
	ELSE num_rows - (result).num_rows_processed
	END)::bigint AS num_rows_skipped,
	{col_grp_iteration}::integer AS num_iterations
	FROM (
	SELECT
	{col_grp_iteration}, {grouping_str1} result, num_rows
	FROM (
	( SELECT
	{grouping_str1}
	{schema_madlib}.{glm_result}({col_grp_state}) AS result,
	{col_grp_iteration}
	FROM
	{rel_state}
	) t
	JOIN
	( SELECT
	{grouping_str1}
	max({col_grp_iteration}) AS {col_grp_iteration}
	FROM {rel_state}
	GROUP BY {grouping_str2}
	) s
	USING ({grouping_str1} {col_grp_iteration})
	) q1
	{join_str}
	( SELECT
	{grouping_str1}
	count(*) AS num_rows
	FROM {rel_source}
	GROUP BY {grouping_str2}
	) q2
	{using_str}
	) q3
	""".format(grouping_str1=grouping_str1,
	grouping_str2=grouping_str2,
	iteration_run=iteration_run,
	using_str=using_str,
	join_str=join_str,
	res_str=res_str,
	glm_result=glm_result,
	**args))

	# summary table
	failed_groups = plpy.execute("""
	SELECT count(*) AS num_failed_groups
	FROM {tbl_output}
	WHERE coef IS NULL
	""".format(**args))[0]
	all_groups = plpy.execute("""
	SELECT count(*) AS num_all_groups
	FROM {tbl_output}
	""".format(**args))[0]
	total_rows = plpy.execute("""
	SELECT
	sum(num_rows_processed) AS total_rows_processed,
	sum(num_rows_skipped) AS total_rows_skipped
	FROM {tbl_output}
	""".format(tbl_output=tbl_output))[0]

	args.update(failed_groups)
	args.update(all_groups)
	args.update(total_rows)

	tbl_output_summary = add_postfix(tbl_output, "_summary")
	plpy.execute("""
	CREATE TABLE {tbl_output_summary} AS
	SELECT
	'glm'::varchar AS method,
	'{rel_source}'::varchar AS source_table,
	'{tbl_output}'::varchar AS out_table,
	$madlib_super_quote${dcol}$madlib_super_quote$::varchar
	AS dependent_varname,
	$madlib_super_quote${col_ind_var}$madlib_super_quote$::varchar
	AS independent_varname,
	'family={family}, ' \|\|
	'link={link}'::varchar AS family_params,
	{g_str}::text AS grouping_col,
	'optimizer={optimizer}, ' \|\|
	'max_iter={max_iter}, ' \|\|
	'tolerance={tolerance}'::varchar AS optimizer_params,
	{num_all_groups}::integer AS num_all_groups,
	{num_failed_groups}::integer AS num_failed_groups,
	{total_rows_processed}::bigint AS total_rows_processed,
	{total_rows_skipped}::bigint AS total_rows_skipped
	""".format(g_str="'" + grouping_col + "'" if grouping_col else "NULL",
	tbl_output_summary=tbl_output_summary,
	dcol=col_dep_var,
	**args))

	# clean up
	plpy.execute("""DROP TABLE IF EXISTS pg_temp.{rel_state} """.format(**args))
	plpy.execute("SET client_min_messages TO " + old_msg_level)
	return None


	# ========================================================================

	def glm_help_msg(schema_madlib, message, **kwargs):
	""" Help message for generalized linear regression model

	@param message A string, the help message indicator

	Returns:
	A string, contains the help message
	"""
	if not message:

	help_string = """
	------------------------------------------------------------------
	SUMMARY
	------------------------------------------------------------------
	Generalized Linear Model:

	Function to fit a generalized linear model, relating responses to linear combinations
	of predictor variables.

	For details on function usage:
	"""
	elif message in ['usage', 'help', '?']:

	help_string = """
	------------------------------------------------------------------
	USAGE
	------------------------------------------------------------------
	SELECT {schema_madlib}.glm(
	source_table, -- name of input table
	model_table, -- name of model table
	dependent_varname, -- name of dependent variable
	independent_varname, -- names of independent variables
	family_params, -- parameters for family distribution and link function

	usage:
	'family=<family_name>,link=<link_function_name>'

	supported values include:
	'family=poisson,link=identity\|log\|sqrt' (default link: log)
	'family=binomial,link=logit\|probit' (default link: logit)
	'family=gaussian,link=identity\|log\|inverse' (default link: identity)
	'family=inverse_gaussian,link=identity\|log\|inverse\|sqr_inverse' (default link: sqr_inverse i.e. 1/mu^2)
	'family=gamma,link=identity\|log\|inverse' (default link: inverse)

	grouping_col, -- optional, default NULL, names of columns to group-by
	optimizer_params, -- optional, parameters for optimizer

	usage:
	'max_iter=<max_num_iterations>,optimizer=<optimizer_name>,tolerance=<tolerance_value>'

	default values include:
	max_iter=100
	optimizer='irls'
	tolerance=1e-6

	verbose -- optional, default FALSE, whether to print debug info
	);

	------------------------------------------------------------------
	OUTPUT
	------------------------------------------------------------------
	The output table ('out_table' above) has the following columns:
	<...> -- grouping columns
	'coef' double precision[], -- vector of coefficients
	'log_likelihood' double precision, -- log likelihood
	'std_err' double precision[], -- vector of standard errors
	'z_stats'/'t_stats' double precision[], -- vector of z-statistics if family=Poisson or Binomial; vector of t-statistics otherwise
	'p_values' double precision[], -- vector of p-values
	'dispersion' double precision[], -- dispersion parameter (if z-stats is used, dispersion is set to be constant 1)
	'num_rows_processed' bigint, -- numbers of rows processed
	'num_rows_skipped' bigint, -- numbers of rows skipped
	'num_iterations' integer -- number of iterations run

	A summary table named <out_table>_summary is also created at the same time, which has:
	method varchar, -- modeling method name: 'glm'
	source_table varchar, -- the data source table name
	model_table varchar, -- the output table name
	dependent_varname varchar, -- the dependent variable
	independent_varname varchar, -- the independent variable
	family_params varchar, -- family distribution and link function
	grouping_col varchar -- grouping columns used in the regression
	optimizer_params varchar, -- 'optimizer=...,max_iter=...,tolerance=...'
	num_all_groups integer, -- how many groups
	num_failed_groups integer, -- how many groups' fitting processes failed
	total_rows_processed bigint, -- total numbers of rows processed
	total_rows_skipped bigint, -- total numbers of rows skipped
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.glm('help')"

	return help_string.format(schema_madlib=schema_madlib)


	# ========================================================================

	def glm_predict_help_msg(schema_madlib, message, **kwargs):
	""" Help message for glm predict function

	@param message A string, the help message indicator

	Returns:
	A string, contains the help message
	"""
	if not message:

	help_string = """
	----------------------------------------------------------------
	SUMMARY
	----------------------------------------------------------------
	Prediction function for generalized linear regression:

	Estimate the conditional mean for the new predictors. The length of input
	coefficients should match the number of variables in the new predictors.

	For details on function usage:
	SELECT {schema_madlib}.glm_predict('usage')

	For prediction functions related to specific distributions:
	SELECT {schema_madlib}.glm_predict_poisson('help')
	SELECT {schema_madlib}.glm_predict_binomial('help')
	"""
	elif message in ['usage', 'help', '?']:

	help_string = """
	------------------------------------------------------------------
	USAGE
	------------------------------------------------------------------
	SELECT {schema_madlib}.glm_predict(
	coef, -- array of coefficients derived from glm() function
	col_ind_var, -- array of independent variables for new predictors
	link -- string indicating the link function specifid in glm()
	);

	------------------------------------------------------------------
	OUTPUT
	------------------------------------------------------------------
	The output is a table with one column which gives the estimated conditional
	means for the new predictors.
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.glm_predict('help')"

	return help_string.format(schema_madlib=schema_madlib)


	# ============================================================================
	# Help messages for specialized prediction functions
	# ============================================================================
	def glm_predict_poisson_help_msg(schema_madlib, message, **kwargs):
	""" Help message for glm predict function

	@param message A string, the help message indicator

	Returns:
	A string, contains the help message
	"""
	if message in ['usage', 'help', '?', '']:

	help_string = """
	------------------------------------------------------------------
	USAGE
	------------------------------------------------------------------
	SELECT {schema_madlib}.glm_predict_poisson(
	coef, -- array of coefficients derived from glm() function
	col_ind_var, -- array of independent variables for new predictors
	link -- string indicating the link function specifid in glm()
	);

	------------------------------------------------------------------
	OUTPUT
	------------------------------------------------------------------
	The output is a table with one column which gives the estimated conditional
	mean for the new predictors, rounded to the nearest integral value.

	For more details on glm predict functions:
	SELECT {schema_madlib}.glm_predict('usage')
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.glm_predict_poisson('help')"

	return help_string.format(schema_madlib=schema_madlib)


	def glm_predict_binomial_help_msg(schema_madlib, message, **kwargs):
	""" Help message for glm predict function

	@param message A string, the help message indicator

	Returns:
	A string, contains the help message
	"""
	if message in ['usage', 'help', '?', '']:

	help_string = """
	------------------------------------------------------------------
	USAGE
	------------------------------------------------------------------
	SELECT {schema_madlib}.glm_predict_binomial(
	coef, -- array of coefficients derived from glm() function
	col_ind_var, -- array of independent variables for new predictors
	link -- string indicating the link function specifid in glm()
	);

	------------------------------------------------------------------
	OUTPUT
	------------------------------------------------------------------
	The output is a table with one column which gives the estimated output category
	of the dependent variable as a boolean value.

	For more details on glm predict functions:
	SELECT {schema_madlib}.glm_predict('usage')
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.glm_predict_binomial('help')"

	return help_string.format(schema_madlib=schema_madlib)