blob: 6974782df76579ce5663fa2edf6f281d73a5ede1 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file marginal.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa Calculates marginal effects for various regression models.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_marginal
@about
A marginal effect (ME) or partial effect measures the effect on the
conditional mean of \f$ y \f$ of a change in one of the regressors, say
\f$X_k\f$. In the linear regression model, the ME equals the
relevant slope coefficient, greatly simplifying analysis. For nonlinear models,
we require specialized algorithms for calculating ME.
The standard approach to modeling dichotomous/binary variables
(so \f$y \in \{0, 1\} \f$) is to estimate a generalized linear model under the
assumption that \f$ y \f$ follows some form of Bernoulli distribution. Thus the expected
value of \f$ y \f$ becomes,
\f[
y = G(X' \beta),
\f]
where G is the specified binomial distribution. For
logistic regression, the funciton \f$ G \f$ represents the inverse logit function.
In logistic regression:
\f[
P = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \dots \beta_j x_j)}}
= \frac{1}{1 + e^{-z}}
\implies \frac{\partial P}{\partial X_k} = \beta_k \cdot \frac{1}{1 + e^{-z}} \cdot
\frac{e^{-z}}{1 + e^{-z}} \\
= \beta_k \cdot P \cdot (1-P)
\f]
There are several methods for calculating the marginal effects for dichotomous
dependent variables. This package uses the average of the marginal effects at
every sample observation.
This is calculated as follows:
\f[
\frac{\partial y}{\partial x_k} = \beta_k \frac{\sum_{i=1}^n P(y_i = 1)(1-P(y_i = 1))}{n}, \\
\text{where}, P(y_i=1) = g(X^{(i)}\beta)
\f]
We use the delta method for calculating standard errors on the marginal effects.
@input
The input parameters are expected to be of the following form:
<pre> <em>margins_logregr</em> (
<em>sourceTable</em> VARCHAR,
<em>outputTable</em> VARCHAR,
<em>dependentVariable</em> VARCHAR,
<em>independentVariable</em> VARCHAR,
[<em>groupingCol</em> VARCHAR,
<em>marginal_vars</em> INTEGER[],
<em>max_iter</em> INTEGER,
<em>optimizer</em> VARCHAR,
<em>tolerance</em> DOUBLE PRECISION
]
)</pre>
For multinomial logistic regression the input parameters are expected to be of the following form:
<pre> <em>margins_mlogregr</em> (
<em>sourceTable</em> VARCHAR,
<em>outputTable</em> VARCHAR,
<em>dependentVariable</em> VARCHAR,
<em>independentVariable</em> VARCHAR,
[<em>groupingCol</em> VARCHAR,
<em>referenceCategory</em> INTEGER,
<em>marginal_vars</em> INTEGER[],
<em>max_iter</em> INTEGER,
<em>optimizer</em> VARCHAR,
<em>tolerance</em> DOUBLE PRECISION
]
)</pre>
\warning The \b 'groupingCol' input parameter for \e margins_logregr and \e margins_mlogregr
is a placeholder in the Madlib V1.0. These inputs will be implemented in a future release.
@usage
<b> The Interface</b>
For logistic regression, one can call the following function
<pre>
SELECT madlib.\ref margins_logregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variable, VARCHAR
[ <em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL)
<em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL)
<em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20)
<em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default & recommended 'irls')
<em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4)
]
);
</pre>
For multinomial logistic regression, one can call the following function
<pre>
SELECT madlib.\refmargins_mlogregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variable, VARCHAR
[ <em>'reference_category'</em>, -- Reference category for multinomial logistic regression INTEGER
<em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL)
<em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL)
<em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20)
<em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default & recommended 'irls')
<em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4)
]
);
</pre>
Output is stored in the <em>out_table</em>:
<pre>
[ margins | std_err | t_stats | p_values |
+------+---------+-------+----------+
</pre>
<b> Multinomial-Logistic Regression Notes</b>
- The reference category ranges from [0, numCategories-1]. The default reference
category is zero.
- The marginal effects and supporting statistics are output in the following order.
For a problem with K dependent variables (1, ..., K) and J categories (0,..., J-1).
The output is \f$ \{ m_{k_1, j_0} \ldots m_{k_K, j_{0}},
m_{k_1, j_1} \ldots m_{k_K, j_{1}}, m_{k_1, j_{J-1}} \ldots m_{k_K, j_{J-1}} \} \f$.
The order is NOT CONSISTENT with mlogregr, robust_variance_mlogregr and clustered_variance_mlogregr.
This is deliberate because the interfaces of all the other functions will be moved
to match that used in marginal.
- Selectively picking the marginal effects of the independent variables will return
the marginal effects of ALL categories associated with the independent variable.
- Here the <em>'independent_varname'</em> can be the name of a column, which contains
array of numeric values. It can also have a format of string 'array[1, x1, x2, x3]',
where <em>x1</em>, <em>x2</em> and <em>x3</em> are all column names.
- Here the <em>'vars'</em> is an index list (base 1) with the set of indices
amongst the independent variables that must be selected.
One can also chose <em>NULL<\em> to pick all indepependent variables.
@examp
-# Create the sample data set:
@verbatim
sql> SELECT * FROM data;
id | second_attack | treatment | trait_anxiety
----+---------------+-----------+---------------
1 | 1 | 1 | 70
3 | 1 | 1 | 50
5 | 1 | 0 | 40
7 | 1 | 0 | 75
9 | 1 | 0 | 70
11 | 0 | 1 | 65
13 | 0 | 1 | 45
15 | 0 | 1 | 40
17 | 0 | 0 | 55
...
@endverbatim
-# For function summary information. Run
@verbatim
sql> select margins_logregr('help');
OR
sql> select margins_logregr();
OR
sql> select margins_logregr('');
@endverbatim
-# Run the logistic regression function and then compute the marginal effects of all variables in the regression:
@verbatim
sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]');
sql> select * from result_table;
margins | {-0.970665392796,-0.156214190168,0.0181587690137}
coef | {-6.36346994178179,-1.02410605239327,0.119044916668605}
std_err | {0.802871454422,0.292691682191,0.0137459874022}
t_stats | {-1.2089922832,-0.533715850748,1.32102325446}
p_values | {0.243212810329,0.600447858606,0.204000202116}
-# Alternate Syntax: Run the logistic regression function and then compute the marginal effects if all variables in the regression:
sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, NULL);
sql> select * from result_table;
margins | {-0.970665392796,-0.156214190168,0.0181587690137}
coef | {-6.36346994178179,-1.02410605239327,0.119044916668605}
std_err | {0.802871454422,0.292691682191,0.0137459874022}
t_stats | {-1.2089922832,-0.533715850748,1.32102325446}
p_values | {0.243212810329,0.600447858606,0.204000202116}
-# Run the logistic regression function and then compute the marginal effects of the first variable in the regression
sql> select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, ARRAY[1]);
sql> select * from result_table;
margins | {-0.970665392796}
coef | {-6.36346994178179}
std_err | {0.802871454422}
t_stats | {-1.2089922832}
p_values | {0.243212810329}
@endverbatim
@examp
-# Create the sample data set (the full dataset has 3 categories):
@verbatim
sql> SELECT * FROM data;
id | feature_1 | feature_2 | category
----+---------------+-----------+---------------
1 | 1 | 35 | 1
3 | 2 | 33 | 0
5 | 3 | 39 | 1
7 | 1 | 37 | 1
9 | 2 | 31 | 1
11 | 3 | 36 | 1
13 | 2 | 36 | 1
15 | 2 | 36 | 0
17 | 2 | 31 | 5
...
@endverbatim
-# For function summary information. Run
@verbatim
sql> select margins_mlogregr('help');
OR
sql> select margins_mlogregr();
OR
sql> select margins_mlogregr('');
@endverbatim
-# Run the regression function and then compute the marginal effects of all variables in the regression (see docs for detailed order)
@verbatim
sql> select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]');
sql> select * from result_table;
margins | {0.741613239156,-0.032868883552,-0.0144502990691,-0.972055011831,0.112337273885,0.0172621628253}
std_err | {0.183172236055,0.044184899499,0.00332608999704,0.263532615748,0.0555196094594,0.00457999429836}
t_stats | {4.04872078394,-0.743894043547,-4.34453038911,-3.68855676202,2.02338011702,3.76903587663}
p_values | {7.43784735554e-05,0.457840607871,2.24855476205e-05,0.000292799037776,0.0444060346517,0.000217384008015}
@endverbatim
-# Run the regression and compute the marginals effects for the first dependent variable (all categories)
@verbatim
sql> select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]', 0, NULL, ARRAY[1]);
sql> select * from result_table;
margins | {0.741613239156,-0.972055011831}
std_err | {0.183172236055,0.263532615748}
t_stats | {4.04872078394,-3.68855676202}
p_values | {7.43784735554e-05,0.000292799037776}
@endverbatim
@verbatim
-# For function usage information.
sql> select margins_mlogregr('usage');
@endverbatim
@literature
[1] mfx function in STATA: http://www.stata.com/help.cgi?mfx_option
@sa File marginal.sql_in documenting the SQL functions.
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
------------------ Marginal Logistic Regression ------------------------------
CREATE TYPE MADLIB_SCHEMA.marginal_logregr_result AS (
margins DOUBLE PRECISION[],
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_transition(
state DOUBLE PRECISION[],
y BOOLEAN,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.marginal_logregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute marginal effects for logistic regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param coef Column containing the array of the coefficients (as obtained by logregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>margins FLOAT8[] </tt> - Array of marginal effects
* - <tt>coefFLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (calculated by the delta method),
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics
* - <tt>p_values FLOAT8[]</tt> - Array of p-values
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT marginal_logregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)
* FROM <em>dataTable</em>;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.marginal_logregr(
/*+ "dependentVariable" */ BOOLEAN,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[] )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.marginal_logregr_step_transition,
m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.marginal_logregr_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.marginal_logregr_step_final,
INITCOND='{0,0,0,0,0,0}'
);
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
usage_string VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN
insert_string := '' ||
E'Summary \n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' Functionality: Calculate marginal effects for logistic regression\n' ||
E' The funciton first runs the regression and then calculates the \n' ||
E' coefficients and uses them to calculate the marginal effects \n' ||
E' SELECT {schema_madlib}.margins_logregr(''source_table'' \n' ||
E' ,''output_table'' \n' ||
E' ,''dependent_variable'' \n' ||
E' ,''independent_variable'' \n' ||
E' [,''grouping_cols'' \n' ||
E' , marginal_variables \n' ||
E' , max_iter \n' ||
E' ,''optimizer'' \n' ||
E' , tolerance \n' ||
E' , verbose_mode \n' ||
E' ]' ||
E' );\n' ||
E'For more details on function usage: \n' ||
E'SELECT {schema_madlib}.margins_logregr(''usage'') \n' ||
E'';
ElSIF (usage_string = 'usage') THEN
insert_string := '' ||
E'-----------------------------------------------------------------------------------------\n' ||
E'Usage\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' SELECT {schema_madlib}.margins_logregr( \n' ||
E' ''source_table'', -- Name of data table \n' ||
E' ''output_table'', -- Name of result table (overwrites if exists) \n' ||
E' ''dependent_variable'', -- Name of column for dependent variables\n' ||
E' ''independent_variable'', -- Name of column for independent variables\n' ||
E' (can be any SQL expression that evaluates to an array)\n' ||
E'[ ''group_cols'', -- Comma separated string with columns to group by\n' ||
E' (Optional : Default NULL => no grouping) \n' ||
E' ''margins_vars'', -- Index list (base 1) representing the independent variables to compute marginal effects on \n' ||
E' (Optional : Default NULL which computes marginal effects on all variables) \n' ||
E' max_iter, -- Maximum number of iterations for the logistic regression\n' ||
E' (Optional : Default 20) \n' ||
E' ''optimizer'', -- Optimizer for logistic regression (newton/irls, cg or igd)\n' ||
E' (Optional : Default irls) \n' ||
E' tolerance, -- Termination criterion for logistic regression (relative)\n' ||
E' (Optional : Default 1e-4) \n' ||
E' verbose_mode, -- Verbose mode \n' ||
E' (Optional : Default False) \n' ||
E']' ||
E' );\n' ||
E'\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E'Output:\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' The output table (''output_table'' above) has the following columns\n' ||
E' ''margins'' DOUBLE PRECISION[], -- Marginal effects \n' ||
E' ''std_err'' DOUBLE PRECISION[], -- Standard errors using the delta method\n' ||
E' ''t_stats'' DOUBLE PRECISION[], -- T-stats of the marginal effects\n' ||
E' ''p_values'' DOUBLE PRECISION[], -- p-values of the marginal effects\n' ||
E'\n' ||
E'';
ELSE
insert_string := 'No such option. Run SELECT {schema_madlib}.margins_logregr()';
END IF;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.margins_logregr('');
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
@brief A wrapper function for the various marginal regression analyzes.
*
* @param source_table String identifying the input table
* @param out_table String identifying the output table to be created
* @param dependent_varname Column containing the dependent variable
* @param input_independent_varname Column containing the array of independent variables
* @param grouping_cols Set of columns to group by.
* @param marginal_vars Subset of independent variables to calculate marginal effects for.
* @param max_iter Maximum number of iterations
* @param optimzer Optimizer to be used (newton/irls, cg or idg)
* @param tolerance Resiual tolerance
* @param verbose_mode Verbose mode (on/off)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return void
*
* @usage
* For function summary information. Run
* sql> select margins_logregr('help');
* OR
* sql> select margins_logregr();
* OR
* sql> select margins_logregr('?');
* For function usage information. Run
* sql> select margins_logregr('usage');
* - Compute the coefficients, and the get the marginal diagnostic statistics:
* <pre>
* select margins_logregr(source_table, out_table, dependentVariable, independentVariables);
* </pre>
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, input_independent_varname VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
, verbose_mode BOOLEAN -- Verbose mode
)
RETURNS VOID AS $$
PythonFunction(regress, marginal, margins_logregr)
$$ LANGUAGE plpythonu;
-- Default Variable calls for margins_logregr
------------------------------------------------------------------------------
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
NULL,
NULL,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
grouping_cols,
NULL,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
grouping_cols,
marginal_vars,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
grouping_cols,
marginal_vars,
max_iter,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
tolerance,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
-- End of Default Variable calls for margins_logregr
------------------------------------------------------------------------------
------------------ Marginal Multi-Logistic Regression ------------------------------
CREATE TYPE MADLIB_SCHEMA.marginal_mlogregr_result AS (
margins DOUBLE PRECISION[],
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_transition(
state DOUBLE PRECISION[],
y INTEGER,
num_categories INTEGER,
ref_category INTEGER,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.marginal_mlogregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute marginal effects for multinomial logistic regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param num_categories Number of categories
* @param ref_category Reference category
* @param coef Column containing the array of the coefficients (as obtained by mlogregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>margins FLOAT8[] </tt> - Array of marginal effects
* - <tt>coef FLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (using the delta method),
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics
* - <tt>p_values FLOAT8[]</tt> - Array of p-values
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT marginal_mlogregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)
* FROM <em>dataTable</em>;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.marginal_mlogregr(
/*+ "dependentVariable" */ INTEGER,
/*+ "num_categories" */ INTEGER,
/*+ "ref_category" */ INTEGER,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[] )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_transition,
m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_final,
INITCOND='{0,0,0,0,0,0}'
);
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
usage_string VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN
insert_string := '' ||
E'Summary \n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' Functionality: Calculate marginal effects for logistic regression\n' ||
E' The funciton first runs the regression and then calculates the \n' ||
E' coefficients and uses them to calculate the marginal effects \n' ||
E' SELECT {schema_madlib}.margins_mlogregr(''source_table'' \n' ||
E' ,''output_table'' \n' ||
E' ,''dependent_variable'' \n' ||
E' ,''independent_variable'' \n' ||
E' [, ref_category \n' ||
E' ,''grouping_cols'' \n' ||
E' ,''marginal_variables'' \n' ||
E' , max_iter \n' ||
E' ,''optimizer'' \n' ||
E' , tolerance \n' ||
E' , verbose_mode \n' ||
E' ]' ||
E' );\n' ||
E'For more details on function usage: \n' ||
E'SELECT {schema_madlib}.margins_mlogregr(''usage'') \n' ||
E'';
ElSIF (usage_string = 'usage') THEN
insert_string := '' ||
E'-----------------------------------------------------------------------------------------\n' ||
E'Usage\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' SELECT {schema_madlib}.margins_mlogregr( \n' ||
E' ''source_table'', -- Name of data table \n' ||
E' ''output_table'', -- Name of result table (overwrites if exists) \n' ||
E' ''dependent_variable'', -- Name of column for dependent variables\n' ||
E' ''independent_variable'', -- Name of column for independent variables\n' ||
E' (can be any SQL expression that evaluates to an array)\n' ||
E'[ ref_category, -- Reference category for the multinomial logisitic regressionby\n' ||
E' (Optional : Default 0) \n' ||
E' ''group_cols'', -- Comma separated string with columns to group by\n' ||
E' (Optional : Default NULL => no grouping) \n' ||
E' margins_vars, -- Index list (base 1) representing the independent variables to compute marginal effects on \n' ||
E' (Optional : Default NULL which computes marginal effects on all variables) \n' ||
E' max_iter, -- Maximum number of iterations for the logistic regression\n' ||
E' (Optional : Default 20) \n' ||
E' ''optimizer'', -- Optimizer for logistic regression (newton/irls, cg or igd)\n' ||
E' (Optional : Default irls) \n' ||
E' tolerance, -- Termination criterion for logistic regression (relative)\n' ||
E' (Optional : Default 1e-4) \n' ||
E' verbose_mode, -- Verbose mode \n' ||
E' (Optional : Default False) \n' ||
E']' ||
E' );\n' ||
E'\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E'Output:\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' The output table (''output_table'' above) has the following columns\n' ||
E' ''margins'' DOUBLE PRECISION[], -- Marginal effects \n' ||
E' ''std_err'' DOUBLE PRECISION[], -- Standard errors using the delta method\n' ||
E' ''t_stats'' DOUBLE PRECISION[], -- T-stats of the standard errors\n' ||
E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' ||
E'\n' ||
E'';
ELSE
insert_string := 'No such option. Run SELECT {schema_madlib}.margins_mlogregr()';
END IF;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.margins_mlogregr('');
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
@brief A wrapper function for the various marginal multinomial regression
*
* @param source_table String identifying the input table
* @param out_table String identifying the output table to be created
* @param dependent_varname Column containing the dependent variable
* @param input_independent_varname Column containing the array of independent variables
* @param ref_category Reference category for the multinomial logistic regression
* @param grouping_cols Set of columns to group by.
* @param marginal_vars Subset of independent variables to calculate marginal effects for.
* @param max_iter Maximum number of iterations
* @param optimzer Optimizer to be used (newton/irls, cg or idg)
* @param tolerance Resiual tolerance
* @param verbose_mode Verbose mode (on/off)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return void
*
* @usage
* For function summary information. Run
* sql> select margins_mlogregr('help');
* OR
* sql> select margins_mlogregr();
* OR
* sql> select margins_mlogregr('?');
* For function usage information. Run
* sql> select margins_mlogregr('usage');
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, input_independent_varname VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, input_group_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
, verbose_mode BOOLEAN -- Verbose mode
)
RETURNS VOID AS $$
PythonFunction(regress, marginal, margins_mlogregr)
$$ LANGUAGE plpythonu;
-- Default Variable calls for margins_mlogregr
------------------------------------------------------------------------------
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
0,
NULL,
NULL,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
NULL,
NULL,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
NULL,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
20,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
'irls',
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
0.0001,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
tolerance,
False);
END;
$$ LANGUAGE plpgsql VOLATILE;
-- End of Default Variable calls for margins_mlogregr
------------------------------------------------------------------------------