| # coding=utf-8 |
| |
| """ |
| @file marginal.py_in |
| |
| @brief Marginal Effects: Common functions |
| |
| @namespace marginal |
| |
| @brief Marginal effects: Common functions |
| """ |
| import plpy |
| import re |
| from utilities.utilities import __unique_string |
| from utilities.utilities import _string_to_array |
| from utilities.validate_args import table_exists |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_is_empty |
| from utilities.validate_args import scalar_col_has_no_null |
| from utilities.validate_args import get_cols |
| |
| # use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0 |
| from utilities.utilities import __mad_version |
| version_wrapper = __mad_version() |
| string_to_array = version_wrapper.select_vecfunc() |
| array_to_string = version_wrapper.select_vec_return() |
| |
| # ======================================================================== |
| |
| def _margins_logregr_validate_args( |
| schema_madlib, |
| source_table, |
| out_table, |
| dependent_varname, |
| independent_varname, |
| input_group_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance): |
| """ |
| Validate the arguments |
| """ |
| if not source_table or source_table in ('null', '') or \ |
| (not table_exists(source_table)): |
| plpy.error("Margins error: Data table does not exist!") |
| |
| if table_is_empty(source_table): |
| plpy.error("Margins error: Data table is empty!") |
| |
| if out_table.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid output table name!") |
| |
| if table_exists(out_table): |
| plpy.error("Margins error: Output table already exists!") |
| |
| if not dependent_varname or dependent_varname.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid dependent column name!") |
| |
| if not scalar_col_has_no_null(source_table, dependent_varname): |
| plpy.error("Margins error: Dependent variable has Null values! \ |
| Please filter out Null values before using this function!") |
| |
| if not independent_varname or independent_varname.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid independent column name!") |
| |
| if input_group_cols and input_group_cols.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid grouping columns name!") |
| |
| if max_iter <= 0: |
| plpy.error("Margins error: Maximum number of iterations must be positive!") |
| |
| if tolerance < 0: |
| plpy.error("Margins error: The tolerance cannot be negative!") |
| |
| if optimizer is None: |
| optimizer = "irls" |
| elif optimizer not in ("irls", "cg", "igd"): |
| plpy.error(""" Margins error: Optimizer does not exist. |
| Must be 'newton'/'irls', 'cg', or 'igd'. |
| """) |
| |
| if optimizer == "newton": |
| optimizer = "irls" |
| elif optimizer not in ("irls", "cg", "igd"): |
| plpy.error(""" Margins error: Optimizer %s does not exist. |
| Must be 'newton'/'irls', 'cg', or 'igd'. |
| """ % optimizer) |
| if input_group_cols: |
| if not columns_exist_in_table(source_table, |
| _string_to_array(input_group_cols), schema_madlib): |
| plpy.error("Margins error: Grouping column does not exist!") |
| |
| |
| # ======================================================================== |
| # Returns relevant ouputs |
| # Note: The marginal_vars index list is 1 base |
| def _internal_get_margins_from_array(marginal_coef, marginal_vars): |
| |
| if min(marginal_vars) < 1: |
| plpy.error('''Margins error: All indices in the marginal coefficients array must be >= 1.''') |
| if max(marginal_vars) > len(marginal_coef): |
| plpy.error('''Margins error: All indices in the marginal coefficients array must be <= the number of independent variables.''') |
| |
| return [marginal_coef[i-1] for i in marginal_vars] |
| |
| |
| # ======================================================================== |
| # Convert numeric array to SQL string |
| def _internal_py_array_to_sql_string(array): |
| return "ARRAY[%s]" % ','.join(map(str, array)) |
| |
| |
| # ======================================================================== |
| # Convert numeric array to SQL string |
| def _internal_return_all_except_dep_vars(schema_madlib, |
| source_table, |
| dependent_varname): |
| cols = get_cols(source_table, schema_madlib) |
| outstr_array = [] |
| for i in range(len(cols)): |
| if cols[i] != dependent_varname: |
| outstr_array.append(cols[i]) |
| |
| return "ARRAY[%s]" % ','.join(outstr_array) |
| |
| # ======================================================================== |
| # Main function call for marginal logisitc regression |
| def margins_logregr(schema_madlib, source_table, out_table |
| , dependent_varname, input_independent_varname |
| , input_group_cols, marginal_vars |
| , max_iter, optimizer, tolerance , **kwargs): |
| """ |
| @brief A wrapper function for the various marginal regression analyzes. |
| |
| @param source_table String identifying the input table |
| @param out_table String identifying the output table to be created |
| @param dependent_varname Column containing the dependent variable |
| @param independent_varname Column containing the array of independent variables |
| @param grouping_cols Set of columns to group by. |
| @param marginal_vars Subset of independent variables to calculate marginal effects for. |
| @param max_iter Maximum number of iterations |
| @param optimzer Optimizer to be used (newton/irls, cg or idg) |
| @param tolerance Resiual tolerance |
| |
| @par |
| To include an intercept in the model, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| @return void |
| |
| @usage |
| For function summary information. Run |
| sql> select margins_logregr('help'); |
| OR |
| sql> select margins_logregr(); |
| OR |
| sql> select margins_logregr('?'); |
| For function usage information. Run |
| sql> select margins_logregr('usage'); |
| """ |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| plpy.execute('SET client_min_messages TO warning') |
| |
| # Validate arguments |
| _margins_logregr_validate_args(schema_madlib, |
| source_table, |
| out_table, |
| dependent_varname, |
| input_independent_varname, |
| input_group_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance); |
| |
| # NOTICE: * support was removed because other modules did not have it. |
| # Uncomment the following code if you want to re-add '*' support |
| |
| # Check for '*' in indepdendent variables |
| #if input_independent_varname == "*": |
| # independent_varname = _internal_return_all_except_dep_vars( |
| # schema_madlib, |
| # source_table, |
| # dependent_varname |
| # ) |
| #else: |
| |
| independent_varname = input_independent_varname |
| |
| # No grouping |
| if not input_group_cols: |
| # Step 1: Create output table with appropriate column names |
| plpy.execute(""" |
| CREATE TABLE %s ( |
| margins DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[])""" % out_table |
| ) |
| |
| # Step 2: Run the regression if the coefficients are not provided |
| temp_table = __unique_string(); |
| plpy.execute(""" |
| SELECT %s.logregr_train('%s', |
| '%s', |
| '%s', |
| '%s', |
| NULL, |
| %s, |
| '%s', |
| %s) |
| """ % (schema_madlib, |
| source_table, |
| temp_table, |
| dependent_varname, |
| independent_varname, |
| max_iter, |
| optimizer, |
| tolerance)) |
| |
| regr_coef = plpy.execute('SELECT coef from %s' % temp_table) |
| regr_coef = string_to_array(regr_coef[0]['coef']) |
| |
| # If marginal variables are none. Then chose all variables |
| if not marginal_vars: |
| marginal_vars = range(1,len(regr_coef)+1) |
| else: |
| marginal_vars = map(int, string_to_array(marginal_vars)) |
| |
| plpy.execute('DROP TABLE IF EXISTS %s' % temp_table) |
| |
| # Step 3: Compute the marginal effects |
| marginal_log_rst = plpy.execute(""" |
| SELECT (%s.marginal_logregr( |
| (%s)::BOOLEAN, |
| %s, |
| %s)).* FROM %s |
| """ % ( |
| schema_madlib, |
| dependent_varname, |
| independent_varname, |
| _internal_py_array_to_sql_string(regr_coef), |
| source_table)) |
| |
| # convert SQL array to a python list |
| # (taking into account GP4.1 exception behavior) |
| margins = string_to_array(marginal_log_rst[0]["margins"]) |
| std_err = string_to_array(marginal_log_rst[0]["std_err"]) |
| p_values = string_to_array(marginal_log_rst[0]["p_values"]) |
| t_stats = string_to_array(marginal_log_rst[0]["t_stats"]) |
| |
| # get subset of indices that need to be returned as solution |
| margins = _internal_get_margins_from_array(margins, marginal_vars) |
| std_err = _internal_get_margins_from_array(std_err, marginal_vars) |
| t_stats = _internal_get_margins_from_array(t_stats, marginal_vars) |
| p_values = _internal_get_margins_from_array(p_values, marginal_vars) |
| |
| # Step 4: |
| insert_string = """ |
| INSERT INTO %s VALUES (%s, %s, %s, %s) |
| """ % (out_table, |
| _internal_py_array_to_sql_string(margins), |
| _internal_py_array_to_sql_string(std_err), |
| _internal_py_array_to_sql_string(t_stats), |
| _internal_py_array_to_sql_string(p_values)) |
| |
| |
| # Step 4: Clean up output to make sure infinity and nan are cast properly |
| insert_string = re.sub('Infinity', "'Infinity'::double precision", insert_string) |
| insert_string = re.sub('infinity', "'Infinity'::double precision", insert_string) |
| insert_string = re.sub('Nan', "'Nan'::double precision", insert_string) |
| insert_string = re.sub('nan', "'Nan'::double precision", insert_string) |
| insert_string = re.sub('NaN', "'Nan'::double precision", insert_string) |
| plpy.execute(insert_string) |
| |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |
| |
| # ======================================================================== |
| # Input handling for multinomial logistic regression |
| def _margins_mlogregr_validate_args( |
| schema_madlib, |
| source_table, |
| out_table, |
| dependent_varname, |
| independent_varname, |
| ref_category, |
| input_group_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance): |
| """ |
| Validate the arguments |
| """ |
| if not source_table or source_table in ('null', '') or \ |
| (not table_exists(source_table)): |
| plpy.error("Margins error: Data table does not exist!") |
| |
| if table_is_empty(source_table): |
| plpy.error("Margins error: Data table is empty!") |
| |
| if out_table.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid output table name!") |
| |
| if table_exists(out_table): |
| plpy.error("Margins error: Output table already exists!") |
| |
| if not dependent_varname or dependent_varname.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid dependent column name!") |
| |
| if not scalar_col_has_no_null(source_table, dependent_varname): |
| plpy.error("Margins error: Dependent variable has Null values! \ |
| Please filter out Null values before using this function!") |
| |
| if not independent_varname or independent_varname.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid independent column name!") |
| |
| if input_group_cols and input_group_cols.lower() in ('null', ''): |
| plpy.error("Margins error: Invalid grouping columns name!") |
| |
| if ref_category is None: |
| plpy.error("Margins: Reference category cannot be null!") |
| |
| if ref_category < 0: |
| plpy.error("Margins error: Reference category cannot be negative!") |
| |
| if max_iter <= 0: |
| plpy.error("Margins error: Maximum number of iterations must be positive!") |
| |
| if tolerance < 0: |
| plpy.error("Margins error: The tolerance cannot be negative!") |
| |
| if optimizer is None: |
| plpy.error(""" Margins error: Optimizer does not exist. |
| Must be 'newton'/'irls'. |
| """) |
| |
| if optimizer == "newton": |
| optimizer = "irls" |
| elif optimizer not in ("irls"): |
| plpy.error(""" Margins error: Optimizer does not exist. |
| Must be 'newton'/'irls'. |
| """) |
| if input_group_cols: |
| if not columns_exist_in_table(source_table, |
| _string_to_array(input_group_cols), schema_madlib): |
| plpy.error("Margins error: Grouping column does not exist!") |
| |
| |
| |
| # ======================================================================== |
| # Main function call for marginal logisitc regression |
| def margins_mlogregr(schema_madlib, source_table, out_table |
| , dependent_varname, input_independent_varname |
| , ref_category, input_group_cols, marginal_vars |
| , max_iter, optimizer, tolerance , **kwargs): |
| """ |
| @brief A wrapper function for the various marginal regression analyzes. |
| |
| @param source_table String identifying the input table |
| @param out_table String identifying the output table to be created |
| @param dependent_varname Column containing the dependent variable |
| @param independent_varname Column containing the array of independent variables |
| @param ref_category Reference category for multinomial logistic regression |
| @param grouping_cols Set of columns to group by. |
| @param marginal_vars Subset of independent variables to calculate marginal effects for. |
| @param max_iter Maximum number of iterations |
| @param optimzer Optimizer to be used (newton/irls, cg or idg) |
| @param tolerance Resiual tolerance |
| |
| @par |
| To include an intercept in the model, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| @return void |
| |
| @usage |
| For function summary information. Run |
| sql> select margins_mlogregr('help'); |
| OR |
| sql> select margins_mlogregr(); |
| OR |
| sql> select margins_mlogregr('?'); |
| """ |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| plpy.execute('SET client_min_messages TO warning') |
| |
| # Validate arguments |
| _margins_mlogregr_validate_args(schema_madlib, |
| source_table, |
| out_table, |
| dependent_varname, |
| input_independent_varname, |
| ref_category, |
| input_group_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance); |
| |
| # NOTICE: * support was removed because other modules did not have it. |
| # Uncomment the following code if you want to re-add '*' support |
| |
| # Check for '*' in indepdendent variables |
| #if input_independent_varname == "*": |
| # independent_varname = _internal_return_all_except_dep_vars( |
| # schema_madlib, |
| # source_table, |
| # dependent_varname |
| # ) |
| #else: |
| |
| independent_varname = input_independent_varname |
| |
| # No grouping |
| if not input_group_cols: |
| # Step 1: Create output table with appropriate column names |
| plpy.execute(""" |
| CREATE TABLE %s ( |
| margins DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[])""" % out_table |
| ) |
| |
| # Step 2: Run the regression if the coefficients are not provided |
| regr_coef = plpy.execute(""" |
| SELECT coef from %s.mlogregr('%s', |
| '%s', |
| '%s', |
| %s, |
| '%s', |
| %s, |
| %s) |
| """ % (schema_madlib, |
| source_table, |
| dependent_varname, |
| independent_varname, |
| max_iter, |
| optimizer, |
| tolerance, |
| ref_category)) |
| |
| regr_coef = string_to_array(regr_coef[0]['coef']) |
| num_categories = plpy.execute(\ |
| 'SELECT count(DISTINCT %s) as num_cats FROM %s' % |
| (dependent_varname, source_table)) |
| num_categories = int(num_categories[0]['num_cats']) |
| |
| # If marginal variables are none. Then chose all variables |
| if not marginal_vars: |
| size = len(regr_coef) |
| marginal_vars = range(1,size+1) |
| else: |
| # For each independent var, get out all the indices |
| marginal_vars_only = map(int, string_to_array(marginal_vars)) |
| marginal_vars = [] |
| |
| # Calculate the number of idep_vars |
| # Note: We assume that the reference variables are not being |
| # output from the multinomial regression |
| num_indep_vars = len(regr_coef) / (num_categories-1) |
| |
| for j in range(num_categories-1): |
| for m in marginal_vars_only: |
| # Note: The marginal_vars is a base 1 array |
| marginal_vars.append(j * num_indep_vars + m) |
| |
| # Step 3: Compute the marginal effects |
| marginal_log_rst = plpy.execute(""" |
| SELECT (%s.marginal_mlogregr( |
| (%s)::INTEGER, |
| %s, |
| %s, |
| %s, |
| %s)).* FROM %s |
| """ % ( |
| schema_madlib, |
| dependent_varname, |
| num_categories, |
| ref_category, |
| independent_varname, |
| _internal_py_array_to_sql_string(regr_coef), |
| source_table)) |
| |
| margins = string_to_array(marginal_log_rst[0]["margins"]) |
| std_err = string_to_array(marginal_log_rst[0]["std_err"]) |
| p_values = string_to_array(marginal_log_rst[0]["p_values"]) |
| t_stats = string_to_array(marginal_log_rst[0]["t_stats"]) |
| |
| margins = _internal_get_margins_from_array(margins, marginal_vars) |
| std_err = _internal_get_margins_from_array(std_err, marginal_vars) |
| t_stats = _internal_get_margins_from_array(t_stats, marginal_vars) |
| p_values = _internal_get_margins_from_array(p_values, marginal_vars) |
| |
| # Step 4: Insert into table |
| insert_string = """ |
| INSERT INTO %s VALUES (%s, %s, %s, %s) |
| """ % (out_table, |
| _internal_py_array_to_sql_string(margins), |
| _internal_py_array_to_sql_string(std_err), |
| _internal_py_array_to_sql_string(t_stats), |
| _internal_py_array_to_sql_string(p_values)) |
| |
| |
| # Step 5: Clean up output to make sure infinity and nan are cast properly |
| insert_string = re.sub('Infinity', "'Infinity'::double precision", insert_string) |
| insert_string = re.sub('inf', "'Infinity'::double precision", insert_string) |
| insert_string = re.sub('Nan', "'Nan'::double precision", insert_string) |
| plpy.execute(insert_string) |
| |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |
| |