/* ----------------------------------------------------------------------- *//**
*
* @file multilogistic.sql_in
*
* @brief SQL functions for multinomial logistic regression
* @date July 2012
*
* @sa For a brief introduction to multinomial logistic regression, see the
* module description \ref grp_mlogreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_mlogreg
@about
Multinomial logistic regression is a widely used regression analysis tool that models the outcomes of a categorical dependent
random variable (denoted \f$ Y \in \{ 0,1,2, \ldots, k \} \f$). The model assumes that the conditional mean of the
dependent categorical variable is the logistic function of an affine combination of independent
variables (usually denoted \f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x)
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$ and where
\f$ \sigma(x) = \frac{1}{1 + \exp(-x)} \f$ is the logistic function. Multinomial Logistic
regression finds the vector of coefficients \f$ \boldsymbol c \f$ that maximizes
the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \{ 0,1 \}^{n \times k} \f$ denote the matrix of observed dependent
variables, with \f$ n \f$ rows and \f$ k \f$ columns, containing the observed values of the
dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
columns and \f$ n \f$ rows, containing all observed vectors of independent
variables \f$ \boldsymbol x_i \f$ as rows.
By definition,
\f[
P[Y = y_i | \boldsymbol x_i]
= \sigma((-1)^{y_i} \cdot \boldsymbol c^T \boldsymbol x_i)
\,.
\f]
Maximizing the likelihood
\f$ \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol x_i) \f$
is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log \Pr(Y = y_i \mid \boldsymbol x_i) \f$, which simplifies to
\f[
l(\boldsymbol c) =
-\sum_{i=1}^n \log(1 + \exp((-1)^{y_i}
\cdot \boldsymbol c^T \boldsymbol x_i))
\,.
\f]
The Hessian of this objective is \f$ H = -X^T A X \f$ where
\f$ A = \text{diag}(a_1, \dots, a_n) \f$ is the diagonal matrix with
\f$
a_i = \sigma(\boldsymbol c^T \boldsymbol x_i)
\cdot
\sigma(-\boldsymbol c^T \boldsymbol x_i)
\,.
\f$
Since \f$ H \f$ is non-positive definite, \f$ l(\boldsymbol c) \f$ is concave, so maximizing it is a convex optimization problem.
There are many techniques for solving convex optimization problems. Currently,
logistic regression in MADlib can use:
- Iteratively Reweighted Least Squares
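For the binary formulation sketched above, each IRLS (Newton) iteration can be written as the
following update (a sketch of the standard Newton step, with
\f$ \boldsymbol p = (\sigma(\boldsymbol c^T \boldsymbol x_1), \ldots, \sigma(\boldsymbol c^T \boldsymbol x_n))^T \f$,
\f$ \boldsymbol y = (y_1, \ldots, y_n)^T \f$, and \f$ A \f$ as defined for the Hessian,
all evaluated at the current iterate \f$ \boldsymbol c^{(m)} \f$):
\f[
    \boldsymbol c^{(m+1)} = \boldsymbol c^{(m)}
        + (X^T A X)^{-1} X^T (\boldsymbol y - \boldsymbol p)
\,.
\f]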
We estimate the standard error for coefficient \f$ i \f$ as
\f[
    \mathit{se}(c_i) = \sqrt{\left( (X^T A X)^{-1} \right)_{ii}}
\,.
\f]
The Wald z-statistic is
\f[
z_i = \frac{c_i}{\mathit{se}(c_i)}
\,.
\f]
The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under
the assumptions inherent in the Wald test) of seeing a value at least as extreme
as the one observed, provided that the null hypothesis (\f$ c_i = 0 \f$) is
true. Letting \f$ F \f$ denote the cumulative distribution function of a standard
normal distribution, the Wald \f$ p \f$-value for coefficient \f$ i \f$ is
therefore
\f[
p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| ))
\f]
where \f$ Z \f$ is a standard normally distributed random variable.
The odds ratio for coefficient \f$ i \f$ is estimated as \f$ \exp(c_i) \f$.
The condition number is computed as \f$ \kappa(X^T A X) \f$ during the iteration
immediately <em>preceding</em> convergence (i.e., \f$ A \f$ is computed using
the coefficients of the previous iteration). A large condition number (say, more
than 1000) indicates the presence of significant multicollinearity.
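For example (purely illustrative, reusing the <em>data</em> table from the example
below), this threshold can be checked directly from the output:
<pre>SELECT condition_no,
       condition_no > 1000 AS possible_multicollinearity
FROM mlogregr('data', 'val', 'r1');</pre>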
The multinomial logistic regression uses a default reference category of zero,
and the regression coefficients in the output are in the order described below.
For a problem with
\f$ K \f$ independent variables \f$ (1, \ldots, K) \f$ and \f$ J \f$ categories
\f$ (0, \ldots, J-1) \f$, let \f$ m_{k,j} \f$ denote the coefficient for
independent variable \f$ k \f$ and category \f$ j \f$. The output is
\f$ m_{1,0}, m_{1,1}, \ldots, m_{1,J-1}, m_{2,0}, m_{2,1}, \ldots, m_{2,J-1}, \ldots, m_{K,J-1} \f$.
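As a concrete illustration of the ordering described above (assuming, purely as an
example, \f$ K = 2 \f$ independent variables and \f$ J = 3 \f$ categories), the
<tt>coef</tt> array would be laid out as
\f[
    (m_{1,0},\; m_{1,1},\; m_{1,2},\; m_{2,0},\; m_{2,1},\; m_{2,2})
\,.
\f]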
This order is NOT consistent with the ordering used by the multinomial
marginal-effects function <em>marginal_mlogregr</em>. This is deliberate,
because the interfaces of all multinomial regressions (robust, clustered, ...)
will eventually be moved to match the interface used by <em>marginal_mlogregr</em>.
@input
The training data is expected to be of the following form:\n
<pre>{TABLE|VIEW} <em>sourceName</em> (
...
<em>dependentVariable</em> INTEGER,
<em>independentVariables</em> FLOAT8[],
...
)</pre>
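For instance, a minimal source table in this format (using the illustrative
<em>data</em>/<em>val</em>/<em>r1</em> names from the example below) could be
created as:
<pre>CREATE TABLE data (
    r1  FLOAT8[],  -- independent variables; include a constant 1 for the intercept
    val INTEGER    -- dependent variable, encoded as 0, 1, ..., numCategories-1
);</pre>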
@usage
- The number of independent variables cannot exceed 65535.
- The reference category must be in the range [0, numCategories - 1]. The default
reference category is zero.
- Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
statistics:\n
<pre>SELECT * FROM \ref mlogregr(
'<em>sourceName</em>',
'<em>dependentVariable</em>',
'<em>independentVariables</em>' [,
<em>numberOfIterations</em> [,
'<em>optimizer</em>' [,
<em>precision</em> [,
<em>ref_category</em>]]]]
);</pre>
Output:
<pre>
ref_category | coef | log_likelihood | std_err | z_stats | p_values | odds_ratios | condition_no | num_iterations
-------------+------+----------------+---------+---------+----------+-------------+--------------+---------------
...
</pre>
- Get vector of coefficients \f$ \boldsymbol c \f$:\n
<pre>SELECT (\ref mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre>
- Get a subset of the output columns, e.g., only the array of coefficients
\f$ \boldsymbol c \f$, the log-likelihood
\f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
<pre>SELECT coef, log_likelihood, p_values
FROM \ref mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
Note that the categories are encoded as integers with values in {0, 1, 2, ..., numCategories - 1}.
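A quick sanity check of this encoding on the source table (a sketch, again using
the illustrative <em>data</em>/<em>val</em> names) is:
<pre>SELECT min(val), max(val), count(DISTINCT val)
FROM data;
-- expect min = 0 and max = count(DISTINCT val) - 1</pre>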
@examp
-# Create the sample data set:
@verbatim
sql> SELECT * FROM data;
r1 | val
---------------------------------------------+-----
{1,3.01789340097457,0.454183579888195} | 1
{1,-2.59380532894284,0.602678326424211} | 0
{1,-1.30643094424158,0.151587064377964} | 1
{1,3.60722299199551,0.963550757616758} | 1
{1,-1.52197745628655,0.0782248834148049} | 1
{1,-4.8746574902907,0.345104880165309} | 0
...
@endverbatim
-# Run the multi-logistic regression function:
@verbatim
sql> \x on
Expanded display is on.
sql> SELECT * FROM mlogregr('data', 'val', 'r1', 100, 'irls', 0.001, 0);
-[ RECORD 1 ]--+--------------------------------------------------------------
ref_category   | 0
coef           | {5.59049410898112,2.11077546770772,-0.237276684606453}
log_likelihood | -467.214718489873
std_err        | {0.318943457652178,0.101518723785383,0.294509929481773}
z_stats        | {17.5281667482197,20.7919819024719,-0.805666162169712}
p_values       | {8.73403463417837e-69,5.11539430631541e-96,0.420435365338518}
odds_ratios    | {267.867942976278,8.2546400100702,0.788773016471171}
condition_no   | 179.186118573205
num_iterations | 9
@endverbatim
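-# (Optional) Cross-check the reported odds ratios against \f$ \exp(c_i) \f$.
This is only a sketch; it reuses the call from the previous step and assumes
that <tt>unnest()</tt> is available in your database version:
@verbatim
sql> SELECT exp(c) AS odds_ratio_check
     FROM (
         SELECT unnest(coef) AS c
         FROM mlogregr('data', 'val', 'r1', 100, 'irls', 0.001, 0)
     ) t;
@endverbatim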
@literature
A collection of nice write-ups, with valuable pointers into
further literature:
[1] Annette J. Dobson: An Introduction to Generalized Linear Models, Second Edition. Nov 2001
[2] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf
[3] Srikrishna Sridhar, Mark Wellons, Caleb Welton: Multilogistic Regression:
Notes and References, Jul 12 2012, http://www.cs.wisc.edu/~srikris/mlogit.pdf
[4] Scott A. Czepiel: Maximum Likelihood Estimation
of Logistic Regression Models: Theory and Implementation,
Retrieved Jul 12 2012, http://czep.net/stat/mlelr.pdf
@sa File multilogistic.sql_in (documenting the SQL functions)
@internal
@sa Namespace multilogistic (documenting the driver/outer loop implemented in
Python), Namespace
\ref madlib::modules::regress documenting the implementation in C++
@endinternal
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.mlogregr_result;
CREATE TYPE MADLIB_SCHEMA.mlogregr_result AS
(
ref_category INTEGER,
coef DOUBLE PRECISION[],
log_likelihood DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
odds_ratios DOUBLE PRECISION[],
condition_no DOUBLE PRECISION,
num_iterations INTEGER
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_transition
(
state DOUBLE PRECISION[],
y INTEGER,
num_categories INTEGER,
ref_category INTEGER,
x DOUBLE PRECISION[],
prev_state DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_merge_states
(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_final
(
state DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @internal
* @brief Perform one iteration of the iteratively-reweighted-least-squares
* method for computing multinomial logistic regression
*/
CREATE AGGREGATE MADLIB_SCHEMA.__mlogregr_irls_step(
/*+ y */ INTEGER,
/*+ numCategories */ INTEGER,
/*+ ref_category */ INTEGER,
/*+ x */ DOUBLE PRECISION[],
/*+ previous_state */ DOUBLE PRECISION[]) (
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.__mlogregr_irls_step_transition,
m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.__mlogregr_irls_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__mlogregr_irls_step_final,
INITCOND='{0,0,0,0,0}'
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__internal_mlogregr_irls_step_distance(
/*+ state1 */ DOUBLE PRECISION[],
/*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__internal_mlogregr_irls_result(
/*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.mlogregr_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE FUNCTION MADLIB_SCHEMA.__compute_mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
numcategories INTEGER,
maxnumiterations INTEGER,
optimizer VARCHAR,
"precision" DOUBLE PRECISION,
ref_category INTEGER
)
RETURNS INTEGER
AS $$
PythonFunction(regress, multilogistic, compute_mlogregr)
$$ LANGUAGE plpythonu VOLATILE STRICT;
/**
* @brief Compute multinomial logistic-regression coefficients and diagnostic statistics
*
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @param source Name of the source relation containing the training data
* @param depvar Name of the dependent column (of type INTEGER, with values in [0, numcategories))
* @param indepvar Name of the independent column (of type DOUBLE PRECISION[])
* @param maxnumiterations The maximum number of iterations
* @param optimizer The optimizer to use (
* <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least
* squares)
* @param precision The difference between log-likelihood values in successive
* iterations that should indicate convergence. Note that a non-positive
* value here disables the convergence criterion, and execution will only
* stop after <tt>maxnumiterations</tt> iterations.
* @param ref_category The reference category specified by the user
*
* @return A composite value:
* - <tt>ref_category INTEGER</tt> - Reference category
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$
* - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$
* - <tt>odds_ratios FLOAT8[]</tt> - Array of odds ratios,
* \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$
* - <tt>condition_no FLOAT8</tt> - The condition number of matrix
* \f$ X^T A X \f$ during the iteration immediately <em>preceding</em>
* convergence (i.e., \f$ A \f$ is computed using the coefficients of the
* previous iteration)
* - <tt>num_iterations INTEGER</tt> - The number of iterations before the
* algorithm terminated
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT * FROM mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>');</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>')).coef;</pre>
* - Get a subset of the output columns, e.g., only the array of coefficients
* \f$ \boldsymbol c \f$, the log-likelihood
* \f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT coef, log_likelihood, p_values
* FROM mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>');</pre>
*
* @note This function starts an iterative algorithm. It is not an aggregate
* function. Source and column names have to be passed as strings (due to
* limitations of the SQL syntax).
*
* @internal
* @sa This function is a wrapper for multilogistic::__compute_mlogregr(), which
* sets the default values.
*/
CREATE FUNCTION MADLIB_SCHEMA.mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER /*+ DEFAULT 20 */,
optimizer VARCHAR /*+ DEFAULT 'irls' */,
"precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */,
ref_category INTEGER
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS $$
DECLARE
observed_count INTEGER;
theIteration INTEGER;
fnName VARCHAR;
theResult MADLIB_SCHEMA.mlogregr_result;
numcategories INTEGER;
min_category INTEGER;
max_category INTEGER;
BEGIN
IF (source IS NULL OR trim(source) = '') THEN
RAISE EXCEPTION 'Invalid source table given';
END IF;
IF (depvar IS NULL OR trim(depvar) = '') THEN
RAISE EXCEPTION 'Invalid depvar given';
END IF;
IF (indepvar IS NULL OR trim(indepvar) = '') THEN
RAISE EXCEPTION 'Invalid indepvar given';
END IF;
IF (maxnumiterations IS NULL OR maxnumiterations < 1) THEN
RAISE EXCEPTION 'The maximum number of iterations must be positive';
END IF;
IF (optimizer IS NULL OR trim(optimizer) = '') THEN
RAISE EXCEPTION 'Invalid optimizer given';
END IF;
IF (precision IS NULL) THEN
RAISE EXCEPTION 'Invalid precision given.';
END IF;
IF (ref_category IS NULL OR ref_category < 0) THEN
RAISE EXCEPTION 'Invalid ref_category given.';
END IF;
IF (SELECT atttypid::regtype <> 'INTEGER'::regtype
FROM pg_attribute
WHERE attrelid = source::regclass AND attname = depvar) THEN
RAISE EXCEPTION 'The dependent variable column should be of type INTEGER';
END IF;
EXECUTE $sql$ SELECT count(DISTINCT $sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO observed_count;
numcategories := observed_count;
EXECUTE $sql$ SELECT max($sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO max_category;
EXECUTE $sql$ SELECT min($sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO min_category;
IF max_category != numcategories - 1 OR min_category != 0 THEN
RAISE EXCEPTION 'The value of the dependent variable should be in the
range of [0, %)', numcategories;
END IF;
IF ref_category > numcategories - 1 OR ref_category < 0 THEN
RAISE EXCEPTION 'The value of the reference category should be in the
range of [0, %)', numcategories;
END IF;
IF optimizer = 'irls' OR optimizer = 'newton' THEN
fnName := '__internal_mlogregr_irls_result';
ELSE
RAISE EXCEPTION 'Unknown optimizer (''%''). Must be "newton" or "irls"', optimizer;
END IF;
theIteration := (
SELECT MADLIB_SCHEMA.__compute_mlogregr(
$1, $2, $3, numcategories, $4, $5, $6, $7)
);
-- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using
-- EXECUTE) in the following
-- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
-- function in a subquery
EXECUTE
$sql$
SELECT (result).*
FROM (
SELECT
MADLIB_SCHEMA.$sql$ || fnName || $sql$(_madlib_state) AS result
FROM _madlib_iterative_alg
WHERE _madlib_iteration = $sql$ || theIteration || $sql$
) subq
$sql$
INTO theResult;
-- The number of iterations is not updated in the C++ code. We do it here.
IF NOT (theResult IS NULL) THEN
theResult.num_iterations = theIteration;
END IF;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, 20, 'irls', 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, $4, 'irls', 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER,
optimizer VARCHAR
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, $4, $5, 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
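-- Illustrative invocations of the overloads defined above (a sketch; assumes
-- the 'data' table used in the documentation example and that MADLIB_SCHEMA
-- resolves to the installed schema):
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1');
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50);
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50, 'irls');
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50, 'irls', 0.0001, 0);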