| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file marginal.sql_in |
| * |
| * @brief SQL functions for linear regression |
| * @date January 2011 |
| * |
| * @sa Calculates marginal effects for various regression models. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_marginal |
| |
| @about |
| |
| A marginal effect (ME) or partial effect measures the effect on the |
| conditional mean of \f$ y \f$ of a change in one of the regressors, say |
| \f$X_k\f$. In the linear regression model, the ME equals the |
| relevant slope coefficient, greatly simplifying analysis. For nonlinear models, |
| we require specialized algorithms for calculating ME. |
| |
| |
| The standard approach to modeling dichotomous/binary variables |
| (so \f$y \in \{0, 1\} \f$) is to estimate a generalized linear model under the |
| assumption that \f$ y \f$ follows some form of Bernoulli distribution. Thus the expected |
| value of \f$ y \f$ becomes, |
| \f[ |
| y = G(X' \beta), |
| \f] |
| |
| where G is the specified binomial distribution. For |
| logistic regression, the funciton \f$ G \f$ represents the inverse logit function. |
| |
| In logistic regression: |
| \f[ |
| P = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \dots \beta_j x_j)}} |
| = \frac{1}{1 + e^{-z}} |
| \implies \frac{\partial P}{\partial X_k} = \beta_k \cdot \frac{1}{1 + e^{-z}} \cdot |
| \frac{e^{-z}}{1 + e^{-z}} \\ |
| = \beta_k \cdot P \cdot (1-P) |
| \f] |
| |
| There are several methods for calculating the marginal effects for dichotomous |
| dependent variables. This package uses the average of the marginal effects at |
| every sample observation. |
| |
| This is calculated as follows: |
| \f[ |
| \frac{\partial y}{\partial x_k} = \beta_k \frac{\sum_{i=1}^n P(y_i = 1)(1-P(y_i = 1))}{n}, \\ |
| \text{where}, P(y_i=1) = g(X^{(i)}\beta) |
| \f] |
| |
| We use the delta method for calculating standard errors on the marginal effects. |
| |
| @input |
| |
| The input parameters are expected to be of the following form: |
| <pre> <em>margins_logregr</em> ( |
| <em>sourceTable</em> VARCHAR, |
| <em>outputTable</em> VARCHAR, |
| <em>dependentVariable</em> VARCHAR, |
| <em>independentVariable</em> VARCHAR, |
| [<em>groupingCol</em> VARCHAR, |
| <em>marginal_vars</em> INTEGER[], |
| <em>max_iter</em> INTEGER, |
| <em>optimizer</em> VARCHAR, |
| <em>tolerance</em> DOUBLE PRECISION |
| ] |
| )</pre> |
| |
| For multinomial logistic regression the input parameters are expected to be of the following form: |
| <pre> <em>margins_mlogregr</em> ( |
| <em>sourceTable</em> VARCHAR, |
| <em>outputTable</em> VARCHAR, |
| <em>dependentVariable</em> VARCHAR, |
| <em>independentVariable</em> VARCHAR, |
| [<em>groupingCol</em> VARCHAR, |
| <em>referenceCategory</em> INTEGER, |
| <em>marginal_vars</em> INTEGER[], |
| <em>max_iter</em> INTEGER, |
| <em>optimizer</em> VARCHAR, |
| <em>tolerance</em> DOUBLE PRECISION |
| ] |
| )</pre> |
| |
| \warning The \b 'groupingCol' input parameter for \e margins_logregr and \e margins_mlogregr |
| is a placeholder in the Madlib V1.0. These inputs will be implemented in a future release. |
| @usage |
| |
| <b> The Interface</b> |
| |
| For logistic regression, one can call the following function |
| <pre> |
| SELECT madlib.\ref margins_logregr( |
| <em>'source_table'</em>, -- name of input table, VARCHAR |
| <em>'out_table'</em>, -- name of output table, VARCHAR |
| <em>'dependent_varname'</em>, -- dependent variable, VARCHAR |
| <em>'independent_varname'</em>, -- independent variable, VARCHAR |
| [ <em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL) |
| <em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL) |
| <em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20) |
| <em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default & recommended 'irls') |
| <em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4) |
| ] |
| ); |
| </pre> |
| |
| For multinomial logistic regression, one can call the following function |
| <pre> |
| SELECT madlib.\refmargins_mlogregr( |
| <em>'source_table'</em>, -- name of input table, VARCHAR |
| <em>'out_table'</em>, -- name of output table, VARCHAR |
| <em>'dependent_varname'</em>, -- dependent variable, VARCHAR |
| <em>'independent_varname'</em>, -- independent variable, VARCHAR |
| [ <em>'reference_category'</em>, -- Reference category for multinomial logistic regression INTEGER |
| <em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL) |
| <em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL) |
| <em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20) |
| <em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default & recommended 'irls') |
| <em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4) |
| ] |
| ); |
| </pre> |
| |
| Output is stored in the <em>out_table</em>: |
| <pre> |
| [ margins | std_err | t_stats | p_values | |
| +------+---------+-------+----------+ |
| </pre> |
| |
| <b> Multinomial-Logistic Regression Notes</b> |
| |
| - The reference category ranges from [0, numCategories-1]. The default reference |
| category is zero. |
| |
| - The marginal effects and supporting statistics are output in the following order. |
| For a problem with K dependent variables (1, ..., K) and J categories (0,..., J-1). |
| The output is \f$ \{ m_{k_1, j_0} \ldots m_{k_K, j_{0}}, |
| m_{k_1, j_1} \ldots m_{k_K, j_{1}}, m_{k_1, j_{J-1}} \ldots m_{k_K, j_{J-1}} \} \f$. |
| The order is NOT CONSISTENT with mlogregr, robust_variance_mlogregr and clustered_variance_mlogregr. |
| This is deliberate because the interfaces of all the other functions will be moved |
| to match that used in marginal. |
| |
| - Selectively picking the marginal effects of the independent variables will return |
| the marginal effects of ALL categories associated with the independent variable. |
| |
| - Here the <em>'independent_varname'</em> can be the name of a column, which contains |
| array of numeric values. It can also have a format of string 'array[1, x1, x2, x3]', |
| where <em>x1</em>, <em>x2</em> and <em>x3</em> are all column names. |
| |
| - Here the <em>'vars'</em> is an index list (base 1) with the set of indices |
| amongst the independent variables that must be selected. |
| One can also chose <em>NULL<\em> to pick all indepependent variables. |
| |
| @examp |
| -# Create the sample data set: |
| @verbatim |
| sql> SELECT * FROM data; |
| id | second_attack | treatment | trait_anxiety |
| ----+---------------+-----------+--------------- |
| 1 | 1 | 1 | 70 |
| 3 | 1 | 1 | 50 |
| 5 | 1 | 0 | 40 |
| 7 | 1 | 0 | 75 |
| 9 | 1 | 0 | 70 |
| 11 | 0 | 1 | 65 |
| 13 | 0 | 1 | 45 |
| 15 | 0 | 1 | 40 |
| 17 | 0 | 0 | 55 |
| ... |
| @endverbatim |
| -# For function summary information. Run |
| @verbatim |
| sql> select margins_logregr('help'); |
| OR |
| sql> select margins_logregr(); |
| OR |
| sql> select margins_logregr(''); |
| @endverbatim |
| |
| -# Run the logistic regression function and then compute the marginal effects of all variables in the regression: |
| @verbatim |
| sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]'); |
| sql> select * from result_table; |
| margins | {-0.970665392796,-0.156214190168,0.0181587690137} |
| coef | {-6.36346994178179,-1.02410605239327,0.119044916668605} |
| std_err | {0.802871454422,0.292691682191,0.0137459874022} |
| t_stats | {-1.2089922832,-0.533715850748,1.32102325446} |
| p_values | {0.243212810329,0.600447858606,0.204000202116} |
| |
| |
| -# Alternate Syntax: Run the logistic regression function and then compute the marginal effects if all variables in the regression: |
| sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, NULL); |
| sql> select * from result_table; |
| margins | {-0.970665392796,-0.156214190168,0.0181587690137} |
| coef | {-6.36346994178179,-1.02410605239327,0.119044916668605} |
| std_err | {0.802871454422,0.292691682191,0.0137459874022} |
| t_stats | {-1.2089922832,-0.533715850748,1.32102325446} |
| p_values | {0.243212810329,0.600447858606,0.204000202116} |
| |
| |
| -# Run the logistic regression function and then compute the marginal effects of the first variable in the regression |
| sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, ARRAY[1]); |
| sql> select * from result_table; |
| margins | {-0.970665392796} |
| coef | {-6.36346994178179} |
| std_err | {0.802871454422} |
| t_stats | {-1.2089922832} |
| p_values | {0.243212810329} |
| @endverbatim |
| |
| @examp |
| -# Create the sample data set (the full dataset has 3 categories): |
| @verbatim |
| sql> SELECT * FROM data; |
| id | feature_1 | feature_2 | category |
| ----+---------------+-----------+--------------- |
| 1 | 1 | 35 | 1 |
| 3 | 2 | 33 | 0 |
| 5 | 3 | 39 | 1 |
| 7 | 1 | 37 | 1 |
| 9 | 2 | 31 | 1 |
| 11 | 3 | 36 | 1 |
| 13 | 2 | 36 | 1 |
| 15 | 2 | 36 | 0 |
| 17 | 2 | 31 | 5 |
| ... |
| @endverbatim |
| -# For function summary information. Run |
| @verbatim |
| sql> select margins_mlogregr('help'); |
| OR |
| sql> select margins_mlogregr(); |
| OR |
| sql> select margins_mlogregr(''); |
| @endverbatim |
| |
| -# Run the regression function and then compute the marginal effects of all variables in the regression (see docs for detailed order) |
| @verbatim |
| sql> select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]'); |
| sql> select * from result_table; |
| margins | {0.741613239156,-0.032868883552,-0.0144502990691,-0.972055011831,0.112337273885,0.0172621628253} |
| std_err | {0.183172236055,0.044184899499,0.00332608999704,0.263532615748,0.0555196094594,0.00457999429836} |
| t_stats | {4.04872078394,-0.743894043547,-4.34453038911,-3.68855676202,2.02338011702,3.76903587663} |
| p_values | {7.43784735554e-05,0.457840607871,2.24855476205e-05,0.000292799037776,0.0444060346517,0.000217384008015} |
| @endverbatim |
| |
| -# Run the regression and compute the marginals effects for the first dependent variable (all categories) |
| @verbatim |
| sql> select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]', 0, NULL, ARRAY[1]); |
| sql> select * from result_table; |
| margins | {0.741613239156,-0.972055011831} |
| std_err | {0.183172236055,0.263532615748} |
| t_stats | {4.04872078394,-3.68855676202} |
| p_values | {7.43784735554e-05,0.000292799037776} |
| @endverbatim |
| |
| @verbatim |
| -# For function usage information. |
| sql> select margins_mlogregr('usage'); |
| @endverbatim |
| |
| @literature |
| |
| |
| [1] mfx function in STATA: http://www.stata.com/help.cgi?mfx_option |
| |
| @sa File marginal.sql_in documenting the SQL functions. |
| |
| @internal |
| @sa Namespace \ref madlib::modules::regress |
| documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| |
| |
| |
| ------------------ Marginal Logistic Regression ------------------------------ |
| |
| CREATE TYPE MADLIB_SCHEMA.marginal_logregr_result AS ( |
| margins DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_transition( |
| state DOUBLE PRECISION[], |
| y BOOLEAN, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_merge_states( |
| state1 DOUBLE PRECISION[], |
| state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_final( |
| state DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.marginal_logregr_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| /** |
| * @brief Compute marginal effects for logistic regression. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * @param coef Column containing the array of the coefficients (as obtained by logregr) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>margins FLOAT8[] </tt> - Array of marginal effects |
| * - <tt>coefFLOAT8[] </tt> - The coefficients for the regression |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (calculated by the delta method), |
| * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics |
| * - <tt>p_values FLOAT8[]</tt> - Array of p-values |
| * |
| * @usage |
| * - Get all the diagnostic statistics:\n |
| * |
| * <pre> SELECT marginal_logregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>, coef) |
| * FROM <em>dataTable</em>; |
| * </pre> |
| */ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.marginal_logregr( |
| /*+ "dependentVariable" */ BOOLEAN, |
| /*+ "independentVariables" */ DOUBLE PRECISION[], |
| /*+ "coef" */ DOUBLE PRECISION[] )( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.marginal_logregr_step_transition, |
| m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.marginal_logregr_step_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.marginal_logregr_step_final, |
| INITCOND='{0,0,0,0,0,0}' |
| ); |
| |
| |
| --------------------------- Interface ---------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| DECLARE |
| insert_string VARCHAR; |
| BEGIN |
| IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN |
| insert_string := '' || |
| E'Summary \n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' Functionality: Calculate marginal effects for logistic regression\n' || |
| E' The funciton first runs the regression and then calculates the \n' || |
| E' coefficients and uses them to calculate the marginal effects \n' || |
| E' SELECT {schema_madlib}.margins_logregr(''source_table'' \n' || |
| E' ,''output_table'' \n' || |
| E' ,''dependent_variable'' \n' || |
| E' ,''independent_variable'' \n' || |
| E' [,''grouping_cols'' \n' || |
| E' , marginal_variables \n' || |
| E' , max_iter \n' || |
| E' ,''optimizer'' \n' || |
| E' , tolerance \n' || |
| E' , verbose_mode \n' || |
| E' ]' || |
| E' );\n' || |
| E'For more details on function usage: \n' || |
| E'SELECT {schema_madlib}.margins_logregr(''usage'') \n' || |
| E''; |
| ElSIF (usage_string = 'usage') THEN |
| insert_string := '' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E'Usage\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' SELECT {schema_madlib}.margins_logregr( \n' || |
| E' ''source_table'', -- Name of data table \n' || |
| E' ''output_table'', -- Name of result table (overwrites if exists) \n' || |
| E' ''dependent_variable'', -- Name of column for dependent variables\n' || |
| E' ''independent_variable'', -- Name of column for independent variables\n' || |
| E' (can be any SQL expression that evaluates to an array)\n' || |
| E'[ ''group_cols'', -- Comma separated string with columns to group by\n' || |
| E' (Optional : Default NULL => no grouping) \n' || |
| E' ''margins_vars'', -- Index list (base 1) representing the independent variables to compute marginal effects on \n' || |
| E' (Optional : Default NULL which computes marginal effects on all variables) \n' || |
| E' max_iter, -- Maximum number of iterations for the logistic regression\n' || |
| E' (Optional : Default 20) \n' || |
| E' ''optimizer'', -- Optimizer for logistic regression (newton/irls, cg or igd)\n' || |
| E' (Optional : Default irls) \n' || |
| E' tolerance, -- Termination criterion for logistic regression (relative)\n' || |
| E' (Optional : Default 1e-4) \n' || |
| E' verbose_mode, -- Verbose mode \n' || |
| E' (Optional : Default False) \n' || |
| E']' || |
| E' );\n' || |
| E'\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E'Output:\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' The output table (''output_table'' above) has the following columns\n' || |
| E' ''margins'' DOUBLE PRECISION[], -- Marginal effects \n' || |
| E' ''std_err'' DOUBLE PRECISION[], -- Standard errors using the delta method\n' || |
| E' ''t_stats'' DOUBLE PRECISION[], -- T-stats of the marginal effects\n' || |
| E' ''p_values'' DOUBLE PRECISION[], -- p-values of the marginal effects\n' || |
| E'\n' || |
| E''; |
| ELSE |
| insert_string := 'No such option. Run SELECT {schema_madlib}.margins_logregr()'; |
| END IF; |
| RETURN insert_string; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.margins_logregr(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| /** |
| @brief A wrapper function for the various marginal regression analyzes. |
| * |
| * @param source_table String identifying the input table |
| * @param out_table String identifying the output table to be created |
| * @param dependent_varname Column containing the dependent variable |
| * @param input_independent_varname Column containing the array of independent variables |
| * @param grouping_cols Set of columns to group by. |
| * @param marginal_vars Subset of independent variables to calculate marginal effects for. |
| * @param max_iter Maximum number of iterations |
| * @param optimzer Optimizer to be used (newton/irls, cg or idg) |
| * @param tolerance Resiual tolerance |
| * @param verbose_mode Verbose mode (on/off) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return void |
| * |
| * @usage |
| * For function summary information. Run |
| * sql> select margins_logregr('help'); |
| * OR |
| * sql> select margins_logregr(); |
| * OR |
| * sql> select margins_logregr('?'); |
| * For function usage information. Run |
| * sql> select margins_logregr('usage'); |
| * - Compute the coefficients, and the get the marginal diagnostic statistics: |
| * <pre> |
| * select margins_logregr(source_table, out_table, dependentVariable, independentVariables); |
| * </pre> |
| * |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , input_independent_varname VARCHAR -- name of independent variable |
| , input_group_cols VARCHAR -- names of columns to group by |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| , tolerance DOUBLE PRECISION -- Tolerance |
| , verbose_mode BOOLEAN -- Verbose mode |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(regress, marginal, margins_logregr) |
| $$ LANGUAGE plpythonu; |
| |
| |
| |
| -- Default Variable calls for margins_logregr |
| ------------------------------------------------------------------------------ |
| |
| /** |
| * @brief Marginal effects with default variables |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| NULL, |
| NULL, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , grouping_cols VARCHAR -- name of grouping cols |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| grouping_cols, |
| NULL, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| grouping_cols, |
| marginal_vars, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| , tolerance DOUBLE PRECISION -- Tolerance |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_logregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| -- End of Default Variable calls for margins_logregr |
| ------------------------------------------------------------------------------ |
| |
| |
| |
| ------------------ Marginal Multi-Logistic Regression ------------------------------ |
| |
| CREATE TYPE MADLIB_SCHEMA.marginal_mlogregr_result AS ( |
| margins DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_transition( |
| state DOUBLE PRECISION[], |
| y INTEGER, |
| num_categories INTEGER, |
| ref_category INTEGER, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_merge_states( |
| state1 DOUBLE PRECISION[], |
| state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_final( |
| state DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.marginal_mlogregr_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| /** |
| * @brief Compute marginal effects for multinomial logistic regression. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * @param num_categories Number of categories |
| * @param ref_category Reference category |
| * @param coef Column containing the array of the coefficients (as obtained by mlogregr) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>margins FLOAT8[] </tt> - Array of marginal effects |
| * - <tt>coef FLOAT8[] </tt> - The coefficients for the regression |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (using the delta method), |
| * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics |
| * - <tt>p_values FLOAT8[]</tt> - Array of p-values |
| * |
| * @usage |
| * - Get all the diagnostic statistics:\n |
| * |
| * <pre> SELECT marginal_mlogregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>, coef) |
| * FROM <em>dataTable</em>; |
| * </pre> |
| */ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.marginal_mlogregr( |
| /*+ "dependentVariable" */ INTEGER, |
| /*+ "num_categories" */ INTEGER, |
| /*+ "ref_category" */ INTEGER, |
| /*+ "independentVariables" */ DOUBLE PRECISION[], |
| /*+ "coef" */ DOUBLE PRECISION[] )( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_transition, |
| m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_final, |
| INITCOND='{0,0,0,0,0,0}' |
| ); |
| |
| |
| --------------------------- Interface ---------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| DECLARE |
| insert_string VARCHAR; |
| BEGIN |
| IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN |
| insert_string := '' || |
| E'Summary \n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' Functionality: Calculate marginal effects for logistic regression\n' || |
| E' The funciton first runs the regression and then calculates the \n' || |
| E' coefficients and uses them to calculate the marginal effects \n' || |
| E' SELECT {schema_madlib}.margins_mlogregr(''source_table'' \n' || |
| E' ,''output_table'' \n' || |
| E' ,''dependent_variable'' \n' || |
| E' ,''independent_variable'' \n' || |
| E' [, ref_category \n' || |
| E' ,''grouping_cols'' \n' || |
| E' ,''marginal_variables'' \n' || |
| E' , max_iter \n' || |
| E' ,''optimizer'' \n' || |
| E' , tolerance \n' || |
| E' , verbose_mode \n' || |
| E' ]' || |
| E' );\n' || |
| E'For more details on function usage: \n' || |
| E'SELECT {schema_madlib}.margins_mlogregr(''usage'') \n' || |
| E''; |
| ElSIF (usage_string = 'usage') THEN |
| insert_string := '' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E'Usage\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' SELECT {schema_madlib}.margins_mlogregr( \n' || |
| E' ''source_table'', -- Name of data table \n' || |
| E' ''output_table'', -- Name of result table (overwrites if exists) \n' || |
| E' ''dependent_variable'', -- Name of column for dependent variables\n' || |
| E' ''independent_variable'', -- Name of column for independent variables\n' || |
| E' (can be any SQL expression that evaluates to an array)\n' || |
| E'[ ref_category, -- Reference category for the multinomial logisitic regressionby\n' || |
| E' (Optional : Default 0) \n' || |
| E' ''group_cols'', -- Comma separated string with columns to group by\n' || |
| E' (Optional : Default NULL => no grouping) \n' || |
| E' margins_vars, -- Index list (base 1) representing the independent variables to compute marginal effects on \n' || |
| E' (Optional : Default NULL which computes marginal effects on all variables) \n' || |
| E' max_iter, -- Maximum number of iterations for the logistic regression\n' || |
| E' (Optional : Default 20) \n' || |
| E' ''optimizer'', -- Optimizer for logistic regression (newton/irls, cg or igd)\n' || |
| E' (Optional : Default irls) \n' || |
| E' tolerance, -- Termination criterion for logistic regression (relative)\n' || |
| E' (Optional : Default 1e-4) \n' || |
| E' verbose_mode, -- Verbose mode \n' || |
| E' (Optional : Default False) \n' || |
| E']' || |
| E' );\n' || |
| E'\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E'Output:\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' The output table (''output_table'' above) has the following columns\n' || |
| E' ''margins'' DOUBLE PRECISION[], -- Marginal effects \n' || |
| E' ''std_err'' DOUBLE PRECISION[], -- Standard errors using the delta method\n' || |
| E' ''t_stats'' DOUBLE PRECISION[], -- T-stats of the standard errors\n' || |
| E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' || |
| E'\n' || |
| E''; |
| ELSE |
| insert_string := 'No such option. Run SELECT {schema_madlib}.margins_mlogregr()'; |
| END IF; |
| RETURN insert_string; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.margins_mlogregr(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| /** |
| @brief A wrapper function for the various marginal multinomial regression |
| * |
| * @param source_table String identifying the input table |
| * @param out_table String identifying the output table to be created |
| * @param dependent_varname Column containing the dependent variable |
| * @param input_independent_varname Column containing the array of independent variables |
| * @param ref_category Reference category for the multinomial logistic regression |
| * @param grouping_cols Set of columns to group by. |
| * @param marginal_vars Subset of independent variables to calculate marginal effects for. |
| * @param max_iter Maximum number of iterations |
| * @param optimzer Optimizer to be used (newton/irls, cg or idg) |
| * @param tolerance Resiual tolerance |
| * @param verbose_mode Verbose mode (on/off) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return void |
| * |
| * @usage |
| * For function summary information. Run |
| * sql> select margins_mlogregr('help'); |
| * OR |
| * sql> select margins_mlogregr(); |
| * OR |
| * sql> select margins_mlogregr('?'); |
| * For function usage information. Run |
| * sql> select margins_mlogregr('usage'); |
| * |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , input_independent_varname VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , input_group_cols VARCHAR -- names of columns to group by |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| , tolerance DOUBLE PRECISION -- Tolerance |
| , verbose_mode BOOLEAN -- Verbose mode |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(regress, marginal, margins_mlogregr) |
| $$ LANGUAGE plpythonu; |
| |
| |
| |
| -- Default Variable calls for margins_mlogregr |
| ------------------------------------------------------------------------------ |
| |
| /** |
| * @brief Marginal effects with default variables |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| 0, |
| NULL, |
| NULL, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variables |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| NULL, |
| NULL, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , grouping_cols VARCHAR -- name of grouping cols |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| grouping_cols, |
| NULL, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| grouping_cols, |
| marginal_vars, |
| 20, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| 'irls', |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| 0.0001, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Marginal effects with default variable_names |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , ref_category INTEGER -- reference category |
| , grouping_cols VARCHAR -- name of grouping cols |
| , marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on |
| , max_iter INTEGER -- Max iterations for the logstic regression inner call |
| , optimizer VARCHAR -- Logistic regression optimizer |
| , tolerance DOUBLE PRECISION -- Tolerance |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.margins_mlogregr( |
| source_table, |
| out_table, |
| dependent_variable, |
| independent_variable, |
| ref_category, |
| grouping_cols, |
| marginal_vars, |
| max_iter, |
| optimizer, |
| tolerance, |
| False); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| -- End of Default Variable calls for margins_mlogregr |
| ------------------------------------------------------------------------------ |