blob: 91b01af93f2acc4e273c0d5dacce2fe5ba9c3028 [file] [log] [blame]
/* ----------------------------------------------------------------------- */
/**
*
* @file marginal.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa Calculates marginal effects for various regression models.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_marginal
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#logregr_train">Logistic Regression Training Function</a></li>
<li><a href="#mlogregr_train">Multinomial Logistic Regression Training Function</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#background">Technical Background</a></li>
<li><a href="#literature">Literature</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Calculates marginal effects for the coefficients in logistic and multinomial logistic regression problems.
A marginal effect (ME) or partial effect measures the effect on the
conditional mean of \f$ y \f$ of a change in one of the regressors, say
\f$X_k\f$. In the linear regression model, the ME equals the
relevant slope coefficient, greatly simplifying analysis. For nonlinear models,
specialized algorithms are required for calculating ME. The marginal effect
computed is the average of the marginal effect at every data point present in the
source table.
MADlib provides marginal effects regression functions for logistic and
multinomial logistic regressions.
@anchor logregr_train
@par Logistic Regression Training Function
<pre class="syntax">
margins_logregr( source_table,
output_table,
dependent_variable,
independent_variable,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
tolerance,
verbose_mode
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>VARCHAR. Name of data table.</dd>
<dt>output_table</dt>
<dd>VARCHAR. Name of result table. The output table has the following columns.
<table class="output">
<tr>
<th>margins</th>
<td>DOUBLE PRECISION[]. The marginal effects.</td>
</tr>
<tr>
<th>std_err</th>
<td>DOUBLE PRECISION[]. An array of the standard errors, using the delta method.</td>
</tr>
<tr>
<th>z_stats</th>
<td>DOUBLE PRECISION[]. An array of the z-stats of the marginal effects.</td>
</tr>
<tr>
<th>p_values</th>
<td>DOUBLE PRECISION[]. An array of the Wald p-values of the marginal effects.</td>
</tr>
</table>
A summary table named <output_table>_summary is also created, which is the same as the summary table created by logregr_train function. Please refer to the documentation for logistic regression for details.
</dd>
<dt>dependent_variable</dt>
<dd>VARCHAR. Name of column for dependent variables.</dd>
<dt>independent_variable</dt>
<dd>VARCHAR. Name of column for independent variables. Can be any SQL expression that evaluates to an array.</dd>
<dt>grouping_cols (optional)</dt>
<dd>VARCHAR, default: NULL. <em>Not currently implemented. Any non-NULL value is ignored.</em> An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is used and a single result model is generated.</dd>
<dt>marginal_vars (optional)</dt>
<dd>INTEGER[], default: NULL. Index list (base 1) representing the independent variables to compute marginal effects on. When NULL, computes marginal effects on all variables.</dd>
<dt>max_iter (optional)</dt>
<dd>INTEGER, default: 20. Maximum number of iterations for the logistic regression.</dd>
<dt>optimizer (optional)</dt>
<dd>VARCHAR, default: 'irls'. Optimizer to use for the logistic regression: newton/irls, cg, or igd.</dd>
<dt>tolerance (optional)</dt>
<dd>DOUBLE PRECISION, default: 1e-4. Termination criterion for logistic regression (relative).</dd>
<dt>verbose_mode (optional)</dt>
<dd>BOOLEAN, default FALSE. When TRUE, provides verbose output of the results of training. </dd>
</dl>
@anchor mlogregr_train
@par Multinomial Logistic Regression Training Function
<pre class="syntax">
margins_mlogregr( source_table,
out_table,
dependent_varname,
independent_varname,
ref_category,
grouping_cols,
marginal_vars,
optimizer_params,
verbose_mode
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>VARCHAR. Name of data table.</dd>
<dt>out_table</dt>
<dd>VARCHAR. Name of result table. The output table has the following columns.
<table class="output">
<tr>
<th>category</th>
<td>The category.</td>
</tr>
<tr>
<th>ref_category</th>
<td>The refererence category used for modeling.</td>
</tr>
<tr>
<th>margins</th>
<td>DOUBLE PRECISION[]. The marginal effects.</td>
</tr>
<tr>
<th>std_err</th>
<td>DOUBLE PRECISION[]. An array of the standard errors, using the delta method.</td>
</tr>
<tr>
<th>z_stats</th>
<td>DOUBLE PRECISION[]. An array of the z-stats of the marginal effects.</td>
</tr>
<tr>
<th>p_values</th>
<td>DOUBLE PRECISION[]. An array of the Wald p-values of the marginal effects.</td>
</tr>
</table>
A summary table named <out_table>_summary is also created, which is the same as the summary table created by mlogregr_train function. Please refer to the documentation for multinomial logistic regression for details.
</dd>
<dt>dependent_varname</dt>
<dd>VARCHAR. Name of column for dependent variables.</dd>
<dt>independent_varname</dt>
<dd>VARCHAR. Name of column for independent variables. Can be any SQL expression that evaluates to an array.</dd>
<dt>ref_category (optional)</dt>
<dd>INTEGER, default: 0. Reference category for the multinomial logistic regression.</dd>
<dt>grouping_cols (optional)</dt>
<dd>VARCHAR, default: NULL. <em>Not currently implemented. Any non-NULL value is ignored.</em> An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is used and a single result model is generated.</dd>
<dt>marginal_vars(optional)</dt>
<dd>INTEGER[], default: NULL. Index list (base 1) representing the independent variables to compute marginal effects on. When NULL, computes marginal effects on all variables.</dd>
<dt>optimizer_params (optional)</dt>
<dd>TEXT, default: NULL, which uses the default values of optimizer parameters: max_iter=20, optimizer='newton', tolerance=1e-4. It should be a string that contains pairs of 'key=value' separated by commas.</dd>
<dt>verbose_mode (optional)</dt>
<dd>BOOLEAN, default FALSE. When TRUE, provides verbose output of the results of training. </dd>
</dl>
@anchor examples
@examp
-# View online help for the marginal effects logistic regression function.
<pre class="example">
SELECT madlib.margins_logregr();
</pre>
-# Create the sample data set.
<pre class="example">
SELECT * FROM data;
</pre>
Result:
<pre class="result">
id | second_attack | treatment | trait_anxiety
----+---------------+-----------+---------------
1 | 1 | 1 | 70
3 | 1 | 1 | 50
5 | 1 | 0 | 40
7 | 1 | 0 | 75
9 | 1 | 0 | 70
11 | 0 | 1 | 65
13 | 0 | 1 | 45
15 | 0 | 1 | 40
17 | 0 | 0 | 55
...
</pre>
-# Run the logistic regression function and then compute the marginal effects of all variables in the regression.
<pre class="example">
SELECT madlib.margins_logregr( 'patients',
'result_table',
'second_attack',
'ARRAY[1, treatment, trait_anxiety]'
);
</pre>
-# View the regression results.
<pre class="example">
\\x ON
SELECT * FROM result_table;
</pre>
Result:
<pre class="result">
margins | {-0.970665392796,-0.156214190168,0.0181587690137}
coef | {-6.36346994178179,-1.02410605239327,0.119044916668605}
std_err | {0.802871454422,0.292691682191,0.0137459874022}
z_stats | {-1.2089922832,-0.533715850748,1.32102325446}
p_values | {0.2266658, 0.5935381, 0.1864936}
</pre>
-# Run the logistic regression function and then compute the marginal effects of the first variable in the regression.
<pre class="example">
SELECT madlib.margins_logregr( 'patients',
'result_table',
'second_attack',
'ARRAY[1, treatment, trait_anxiety]',
NULL,
ARRAY[1]
);
SELECT * FROM result_table;
</pre>
Result:
<pre class="result">
margins | {-0.970665392796}
coef | {-6.36346994178179}
std_err | {0.802871454422}
z_stats | {-1.2089922832}
p_values | {0.2266658}
</pre>
-# View online help for marginal effects multinomial logistic regression.
<pre class="example">
SELECT madlib.margins_mlogregr();
</pre>
-# Create a sample data set for multinomial logistic regression. (The full dataset has three categories.)
<pre class="example">
SELECT * FROM data;
</pre>
Result:
<pre class="result">
id | feature_1 | feature_2 | category
----+---------------+-----------+---------------
1 | 1 | 35 | 1
3 | 2 | 33 | 0
5 | 3 | 39 | 1
7 | 1 | 37 | 1
9 | 2 | 31 | 1
11 | 3 | 36 | 1
13 | 2 | 36 | 1
15 | 2 | 36 | 0
17 | 2 | 31 | 5
...
</pre>
-# Get help for the marginal effects multinomial logistic regression function.
<pre class="example">
SELECT margins_mlogregr('help');
</pre>
-# Run the regression function and then compute the marginal effects of all variables in the regression.
<pre class="example">
SELECT madlib.margins_mlogregr( 'test_data',
'result_table',
'category',
'ARRAY[1, feature_1, feature_2]'
);
SELECT * FROM result_table;
</pre>
Result:
<pre class="result">
margins | {0.741613239156,-0.032868883552,-0.0144502990691,-0.972055011831,0.112337273885,0.0172621628253}
std_err | {0.183172236055,0.044184899499,0.00332608999704,0.263532615748,0.0555196094594,0.00457999429836}
z_stats | {4.04872078394,-0.743894043547,-4.34453038911,-3.68855676202,2.02338011702,3.76903587663}
p_values | {5.149834e-05, 4.569406e-01, 1.395739e-05, 2.255297e-04, 4.303397e-02, 1.638793e-04}
</pre>
-# Run the regression and compute the marginal effects for the first dependent variable (all categories).
<pre class="example">
SELECT madlib.margins_mlogregr( 'test_data',
'result_table',
'category',
'ARRAY[1, feature_1, feature_2]',
0,
NULL,
ARRAY[1]
);
SELECT * FROM result_table;
</pre>
Result:
<pre class="result">
margins | {0.741613239156,-0.972055011831}
std_err | {0.183172236055,0.263532615748}
z_stats | {4.04872078394,-3.68855676202}
p_values | {5.149834e-05, 2.255297e-04}
</pre>
@anchor notes
@par Notes
<b>Multinomial-Logistic Regression Notes</b>
- The reference category ranges from [0, numCategories - 1]. The default reference
category is zero.
- The marginal effects and supporting statistics are output in the following order.
For a problem with K dependent variables (1, ..., K) and J categories (0,..., J-1).
The output is \f$ \{ m_{k_1, j_0} \ldots m_{k_K, j_{0}},
m_{k_1, j_1} \ldots m_{k_K, j_{1}}, m_{k_1, j_{J-1}} \ldots m_{k_K, j_{J-1}} \} \f$.
The order is NOT CONSISTENT with mlogregr, robust_variance_mlogregr and clustered_variance_mlogregr.
This is deliberate; the interfaces of the other functions will be changed
to match that used in the marginal effects function.
- Selectively picking the marginal effects of the independent variables will return
the marginal effects of \e all categories associated with the independent variable.
- The <em>independent_varname</em> argument can be the name of a column that contains an
array of numeric values. It can also be a string with the format 'ARRAY[1, x1, x2, x3]',
where <em>x1</em>, <em>x2</em>, and <em>x3</em> are column names.
- The <em>vars</em> argument is an index list (base 1) with the set of indices
amongst the independent variables that must be selected.
Use <em>NULL</em> to use all independent variables.
@anchor background
@par Technical Background
The standard approach to modeling dichotomous/binary variables
(so \f$y \in \{0, 1\} \f$) is to estimate a generalized linear model under the
assumption that \f$ y \f$ follows some form of Bernoulli distribution. Thus the expected
value of \f$ y \f$ becomes,
\f[
y = G(X' \beta),
\f]
where G is the specified binomial distribution. For
logistic regression, the function \f$ G \f$ represents the inverse logit function.
In logistic regression:
\f[
P = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \dots \beta_j x_j)}}
= \frac{1}{1 + e^{-z}}
\implies \frac{\partial P}{\partial X_k} = \beta_k \cdot \frac{1}{1 + e^{-z}} \cdot
\frac{e^{-z}}{1 + e^{-z}} \\
= \beta_k \cdot P \cdot (1-P)
\f]
There are several methods for calculating the marginal effects for dichotomous
dependent variables. This package uses the average of the marginal effects at
every sample observation.
This is calculated as follows:
\f[
\frac{\partial y}{\partial x_k} = \beta_k \frac{\sum_{i=1}^n P(y_i = 1)(1-P(y_i = 1))}{n}, \\
\text{where}, P(y_i=1) = g(X^{(i)}\beta)
\f]
We use the delta method for calculating standard errors on the marginal effects.
@anchor literature
@literature
[1] mfx function in STATA: http://www.stata.com/help.cgi?mfx_option
@anchor related
@par Related Topics
File marginal.sql_in documenting the SQL functions.
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
------------------ Marginal Logistic Regression ------------------------------
CREATE TYPE MADLIB_SCHEMA.marginal_logregr_result AS (
margins DOUBLE PRECISION[],
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
--------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_transition(
state DOUBLE PRECISION[],
y BOOLEAN,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
--------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
--------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.marginal_logregr_step_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.marginal_logregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
--------------------------------------
/**
* @brief Compute marginal effects for logistic regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param coef Column containing the array of the coefficients (as obtained by logregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>margins FLOAT8[] </tt> - Array of marginal effects
* - <tt>coefFLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (calculated by the delta method),
* - <tt>z_stats FLOAT8[]</tt> - Array of z-statistics
* - <tt>p_values FLOAT8[]</tt> - Array of p-values
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT marginal_logregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)
* FROM <em>dataTable</em>;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.marginal_logregr(
/*+ "dependentVariable" */ BOOLEAN,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[] )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.marginal_logregr_step_transition,
m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.marginal_logregr_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.marginal_logregr_step_final,
INITCOND='{0,0,0,0,0,0}'
);
-------------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(regress, marginal, margins_logregr_help)
$$ LANGUAGE plpythonu VOLATILE;
--------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.margins_logregr('');
$$ LANGUAGE sql VOLATILE;
-------------------------------------------------------------------------------------
/**
@brief A wrapper function for the various marginal regression analyzes.
*
* @param source_table String identifying the input table
* @param out_table String identifying the output table to be created
* @param dependent_varname Column containing the dependent variable
* @param independent_varname Column containing the array of independent variables
* @param grouping_cols Set of columns to group by.
* @param marginal_vars Subset of independent variables to calculate marginal effects for.
* @param max_iter Maximum number of iterations
* @param optimzer Optimizer to be used (newton/irls, cg or idg)
* @param tolerance Resiual tolerance
* @param verbose_mode Verbose mode (on/off)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return void
*
* @usage
* For function summary information. Run
* sql> select margins_logregr('help');
* OR
* sql> select margins_logregr();
* OR
* sql> select margins_logregr('?');
* For function usage information. Run
* sql> select margins_logregr('usage');
* - Compute the coefficients, and the get the marginal diagnostic statistics:
* <pre>
* select margins_logregr(source_table, out_table, dependentVariable, independentVariables);
* </pre>
*
*/
-------------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
, verbose_mode BOOLEAN -- Printing warning message or not
)
RETURNS VOID AS $$
PythonFunction(regress, marginal, margins_logregr)
$$ LANGUAGE plpythonu;
--------------------------------------
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, NULL, NULL, 20, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;
--------------------------------------
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, $5, NULL, 20, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;
--------------------------------------
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, $5, $6, 20, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;
--------------------------------------
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, $5, $6, $7, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;
--------------------------------------
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, $5, $6, $7, $8, 0.0001, False);
$$ LANGUAGE sql VOLATILE;
--------------------------------------
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_logregr(
$1, $2, $3, $4, $5, $6, $7, $8, $9, False);
$$ LANGUAGE sql VOLATILE;
-- End of Default Variable calls for margins_logregr
------------------------------------------------------------------------------
------------------ Marginal Multi-Logistic Regression ------------------------------
CREATE TYPE MADLIB_SCHEMA.marginal_mlogregr_result AS (
margins DOUBLE PRECISION[],
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_transition(
state DOUBLE PRECISION[],
y INTEGER,
num_categories INTEGER,
ref_category INTEGER,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_marginal_step_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.marginal_mlogregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute marginal effects for multinomial logistic regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param num_categories Number of categories
* @param ref_category Reference category
* @param coef Column containing the array of the coefficients (as obtained by mlogregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>margins FLOAT8[] </tt> - Array of marginal effects
* - <tt>coef FLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of standard-errors (using the delta method),
* - <tt>z_stats FLOAT8[]</tt> - Array of z-statistics
* - <tt>p_values FLOAT8[]</tt> - Array of p-values
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT marginal_mlogregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)
* FROM <em>dataTable</em>;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.marginal_mlogregr(
/*+ "dependentVariable" */ INTEGER,
/*+ "num_categories" */ INTEGER,
/*+ "ref_category" */ INTEGER,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[] )(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_transition,
m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.mlogregr_marginal_step_final,
INITCOND='{0,0,0,0,0,0}'
);
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
message VARCHAR
)
RETURNS VARCHAR AS $$
PythonFunction(regress, marginal, margins_mlogregr_help)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr('');
$$ LANGUAGE SQL;
/**
@brief A wrapper function for the various marginal multinomial regression
*
* @param source_table String identifying the input table
* @param out_table String identifying the output table to be created
* @param dependent_varname Column containing the dependent variable
* @param independent_varname Column containing the array of independent variables
* @param ref_category Reference category for the multinomial logistic regression
* @param grouping_cols Set of columns to group by.
* @param marginal_vars Subset of independent variables to calculate marginal effects for.
* @param max_iter Maximum number of iterations
* @param optimzer Optimizer to be used (newton/irls, cg or idg)
* @param tolerance Resiual tolerance
* @param verbose_mode Verbose mode (on/off)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return void
*
* @usage
* For function summary information. Run
* sql> select margins_mlogregr('help');
* OR
* sql> select margins_mlogregr();
* OR
* sql> select margins_mlogregr('?');
* For function usage information. Run
* sql> select margins_mlogregr('usage');
*
*/
/*
-- NOTICE ---------------------------------------------------------------
-- All functions calling other modeling functions should end up in below format:
-- accept a model table created by the underlying modeling function
-- and extract all parameters from the model table and summary table
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
model_table VARCHAR -- name of the mlogregr model table
, out_table VARCHAR -- name of output table
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
)
RETURNS VOID AS $$
PythonFunction(regress, marginal, margins_mlogregr_new)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
model_table VARCHAR -- name of the mlogregr model table
, output_table VARCHAR -- name of output table
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, NULL::INTEGER[])
$$ LANGUAGE SQL;
-- END OF NOTICE ---------------------------------------------------------------
*/
--------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, optimizer_params VARCHAR -- a comma-separated string with optimizer parameters
, verbose_mode BOOLEAN -- Verbose mode
)
RETURNS VOID AS $$
PythonFunction(regress, marginal, margins_mlogregr_main)
$$ LANGUAGE plpythonu;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, optimizer_params VARCHAR -- a comma-separated string with optimizer parameters
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, $7, $8, FALSE);
$$ LANGUAGE sql VOLATILE;
-- DEPRECATED NOTICE -----------------------------------------------------------
-- The below functions has been deprecated and should be removed in next major
-- version update
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- names of columns to group by
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
, verbose_mode BOOLEAN -- Verbose mode
)
RETURNS VOID AS $$
DECLARE
max_iter_string VARCHAR;
optimizer_string VARCHAR;
tolerance_string VARCHAR;
BEGIN
IF max_iter is NULL THEN
max_iter_string := 'null';
ELSE
max_iter_string := max_iter::text;
END IF;
IF optimizer is NULL THEN
optimizer_string := 'null';
ELSE
optimizer_string := optimizer::text;
END IF;
IF tolerance is NULL THEN
tolerance_string := 'null';
ELSE
tolerance_string := tolerance::text;
END IF;
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_varname,
independent_varname,
ref_category,
grouping_cols,
marginal_vars,
'max_iter=' || max_iter_string ||
', optimizer=' || optimizer_string ||
', tolerance=' || tolerance_string,
verbose_mode);
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
'irls',
0.0001,
False);
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
0.0001,
False);
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
, max_iter INTEGER -- Max iterations for the logstic regression inner call
, optimizer VARCHAR -- Logistic regression optimizer
, tolerance DOUBLE PRECISION -- Tolerance
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.margins_mlogregr(
source_table,
out_table,
dependent_variable,
independent_variable,
ref_category,
grouping_cols,
marginal_vars,
max_iter,
optimizer,
tolerance,
False);
END
$$ LANGUAGE plpgsql VOLATILE;
-- End of Default Variable calls for margins_mlogregr
------------------------------------------------------------------------------
-- END OF DEPRECATED NOTICE -----------------------------------------------------------
-- Default calls for margins_mlogregr (Overloaded functions)
------------------------------------------------------------------------------
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, 0, NULL::VARCHAR,
NULL::INTEGER[],
'max_iter=20, optimizer=irls, tolerance=0.0001',
False);
$$ LANGUAGE sql VOLATILE;
/**
* @brief Marginal effects with default variables
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, NULL::VARCHAR,
NULL::INTEGER[],
'max_iter=20, optimizer=irls, tolerance=0.0001',
False);
$$ LANGUAGE sql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, NULL::INTEGER[],
'max_iter=20, optimizer=irls, tolerance=0.0001',
False);
$$ LANGUAGE sql VOLATILE;
/**
* @brief Marginal effects with default variable_names
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.margins_mlogregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, ref_category INTEGER -- reference category
, grouping_cols VARCHAR -- name of grouping cols
, marginal_vars INTEGER[] -- indices of variables to calculate marginal effects on
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.margins_mlogregr($1, $2, $3, $4, $5, $6, $7,
'max_iter=20, optimizer=irls, tolerance=0.0001',
False);
$$ LANGUAGE sql VOLATILE;
-- End of Default Variable calls for margins_mlogregr
------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sub_array(
value_array DOUBLE PRECISION[], -- The array containing values to be selected
index_array INTEGER[] -- The array containing the index
) RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;