blob: e7124fc73decfb6c678b7a12e749ab339bd8b116 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file logistic.sql_in
*
* @brief SQL functions for logistic regression
* @date January 2011
*
* @sa For a brief introduction to logistic regression, see the
* module description \ref grp_logreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_logreg
@about
(Binomial) Logistic regression refers to a stochastic model in which the
conditional mean of the dependent dichotomous variable (usually denoted
\f$ Y \in \{ 0,1 \} \f$) is the logistic function of an affine function of the
vector of independent variables (usually denoted \f$ \boldsymbol x \f$). That
is,
\f[
E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x)
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$ and where
\f$ \sigma(x) = \frac{1}{1 + \exp(-x)} \f$ is the logistic function. Logistic
regression finds the vector of coefficients \f$ \boldsymbol c \f$ that maximizes
the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \{ 0,1 \}^n \f$ denote the vector of observed dependent
variables, with \f$ n \f$ rows, containing the observed values of the
dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
columns and \f$ n \f$ rows, containing all observed vectors of independent
variables \f$ \boldsymbol x_i \f$ as rows.
By definition,
\f[
P[Y = y_i | \boldsymbol x_i]
= \sigma((-1)^{y_i} \cdot \boldsymbol c^T \boldsymbol x_i)
\,.
\f]
Maximizing the likelihood
\f$ \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol x_i) \f$
is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log \Pr(Y = y_i \mid \boldsymbol x_i) \f$, which simplifies to
\f[
l(\boldsymbol c) =
-\sum_{i=1}^n \log(1 + \exp((-1)^{y_i}
\cdot \boldsymbol c^T \boldsymbol x_i))
\,.
\f]
The Hessian of this objective is \f$ H = -X^T A X \f$ where
\f$ A = \text{diag}(a_1, \dots, a_n) \f$ is the diagonal matrix with
\f$
    a_i = \sigma(\boldsymbol c^T \boldsymbol x_i)
          \cdot
          \sigma(-\boldsymbol c^T \boldsymbol x_i)
    \,.
\f$
Since \f$ H \f$ is non-positive definite, \f$ l(\boldsymbol c) \f$ is concave, and maximizing it is a convex optimization problem.
There are many techniques for solving convex optimization problems. Currently,
logistic regression in MADlib can use one of two algorithms:
- Iteratively Reweighted Least Squares
- A conjugate-gradient approach, also known as Fletcher-Reeves method in the
literature, where we use the Hestenes-Stiefel rule for calculating the step
size.
We estimate the standard error for coefficient \f$ i \f$ as
\f[
    \mathit{se}(c_i) = \sqrt{ \left( (X^T A X)^{-1} \right)_{ii} }
    \,.
\f]
The Wald z-statistic is
\f[
z_i = \frac{c_i}{\mathit{se}(c_i)}
\,.
\f]
The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under
the assumptions inherent in the Wald test) of observing a z-statistic at least
as extreme as \f$ z_i \f$, provided that the null hypothesis
(\f$ c_i = 0 \f$) is true; a small p-value indicates that \f$ c_i \f$ differs
significantly from 0. Letting \f$ F \f$ denote the cumulative distribution
function of a standard normal distribution, the Wald \f$ p \f$-value for
coefficient \f$ i \f$ is therefore
\f[
p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| ))
\f]
where \f$ Z \f$ is a standard normally distributed random variable.
The odds ratio for coefficient \f$ i \f$ is estimated as \f$ \exp(c_i) \f$.
@input
The training data is expected to be of the following form:\n
<pre>{TABLE|VIEW} <em>sourceName</em> (
...
<em>dependentVariable</em> BOOLEAN,
<em>independentVariables</em> FLOAT8[],
...
)</pre>
@usage
- Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
statistics:\n
<pre>SELECT * FROM \ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
Output:
<pre>coef | log_likelihood | std_err | z_stats | p_values | odds_ratios
-----+----------------+---------+---------+----------+------------
...
</pre>
- Get vector of coefficients \f$ \boldsymbol c \f$:\n
<pre>SELECT (\ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre>
- Get array of coefficients \f$ \boldsymbol c \f$, log-likelihood
\f$ l(\boldsymbol c) \f$, and array of p-values \f$ \boldsymbol p \f$:
<pre>SELECT coef, log_likelihood, p_values
FROM \ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
@examp
-# Create the sample data set:
@verbatim
# SELECT * FROM data;
r1 | val
---------------------------------------------+-----
{1,-0.537208849564195,0.0507612079381943} | t
{1,-1.83960389345884,0.0875045731663704} | t
{1,-4.28730633109808,0.839790890924633} | t
{1,2.4249863717705,0.499282206874341} | t
{1,-4.41880372352898,0.883616517763585} | t
{1,4.16272420901805,0.598018785007298} | t
...
@endverbatim
-# Run the logistic regression function:
@verbatim
# SELECT * FROM madlib.logregr('data', 'val', 'r1', 100, 'irls', 0.001);
coef | log_likelihood | std_err | z_stats | p_values | odds_ratios
--------------------------------------------------------+-------------------+----------------------------------------------------------+-------------------------------------------------------+---------------------------------------------------------------+------------------------------------------------------
{4.93532185789865,0.469393180445221,0.432089528467682} | -173.514819452336 | {0.407205759965291,0.0881607217787946,0.591853189786052} | {12.1199706465825,5.32428921830951,0.730062008492135} | {8.27887853333452e-34,1.01348545192981e-07,0.465352282469966} | {139.117911525733,1.59902357997671,1.54047302517705}
(1 row)
@endverbatim
@literature
A somewhat random selection of nice write-ups, with valuable pointers into
further literature:
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf
[2] Thomas P. Minka: A comparison of numerical optimizers for logistic
regression, 2003 (revised Mar 26, 2007),
http://research.microsoft.com/en-us/um/people/minka/papers/logreg/minka-logreg.pdf
[3] Paul Komarek, Andrew W. Moore: Making Logistic Regression A Core Data Mining
Tool With TR-IRLS, IEEE International Conference on Data Mining 2005,
pp. 685-688, http://komarix.org/ac/papers/tr-irls.short.pdf
@sa File logistic.sql_in (documenting the SQL functions)
@internal
@sa Namespace logistic (documenting the driver/outer loop implemented in
Python), Classes
\ref madlib::modules::regress::LogisticRegressionIRLS "LogisticRegressionIRLS"
and
\ref madlib::modules::regress::LogisticRegressionCG "LogisticRegressionCG"
documenting the implementation in C++
@endinternal
*/
-- Composite return type shared by all logistic-regression entry points below.
-- CREATE TYPE has no OR REPLACE, so drop any previous definition first to keep
-- this script re-runnable. NOTE(review): DROP TYPE fails if dependent objects
-- still exist; presumably the deployment scripts handle that -- confirm.
DROP TYPE IF EXISTS MADLIB_SCHEMA.logregr_result;
CREATE TYPE MADLIB_SCHEMA.logregr_result AS (
coef DOUBLE PRECISION[],           -- fitted coefficient vector c
log_likelihood DOUBLE PRECISION,   -- log-likelihood l(c) at the fitted coefficients
std_err DOUBLE PRECISION[],        -- standard errors, one per coefficient
z_stats DOUBLE PRECISION[],        -- Wald z-statistics, one per coefficient
p_values DOUBLE PRECISION[],       -- Wald p-values, one per coefficient
odds_ratios DOUBLE PRECISION[]     -- odds ratios exp(c_i), one per coefficient
);
-- Transition (state-update) function of the logregr_cg_step aggregate below.
-- Arguments: running aggregation state, y (dependent variable), x (vector of
-- independent variables), previous iteration's state. Implemented in C; see
-- the shared library referenced by MODULE_PATHNAME.
-- NOTE(review): deliberately not STRICT, unlike the merge/final functions --
-- presumably the C code handles NULL inputs (e.g., the initial state) itself;
-- confirm against the C implementation.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_transition(
DOUBLE PRECISION[],
BOOLEAN,
DOUBLE PRECISION[],
DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
-- Transition (state-update) function of the logregr_irls_step aggregate below.
-- Arguments: running aggregation state, y (dependent variable), x (vector of
-- independent variables), previous iteration's state. Implemented in C; see
-- the shared library referenced by MODULE_PATHNAME.
-- NOTE(review): deliberately not STRICT, unlike the merge/final functions --
-- presumably the C code handles NULL inputs (e.g., the initial state) itself;
-- confirm against the C implementation.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_transition(
DOUBLE PRECISION[],
BOOLEAN,
DOUBLE PRECISION[],
DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
-- Merge function for the conjugate-gradient aggregate: combines two partial
-- aggregation states accumulated in parallel. Only wired in on Greenplum
-- (as the prefunc of logregr_cg_step; see the m4_ifdef below).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Merge function for the IRLS aggregate: combines two partial aggregation
-- states accumulated in parallel. Only wired in on Greenplum (as the prefunc
-- of logregr_irls_step; see the m4_ifdef below).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Final function of the conjugate-gradient aggregate: turns the completed
-- transition state into the aggregate's result. NOTE(review): presumably the
-- result is fed back in as the next iteration's previous_state by the Python
-- driver loop -- confirm against the logistic module.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_final(
state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Final function of the IRLS aggregate: turns the completed transition state
-- into the aggregate's result. NOTE(review): presumably the result is fed
-- back in as the next iteration's previous_state by the Python driver loop --
-- confirm against the logistic module.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_final(
state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Drop the aggregate under its legacy name (without the trailing 'r' in
-- 'logregr'; presumably left over from an earlier release -- confirm) AND
-- under its current name, so that this script can be re-run. The original
-- only dropped the legacy name, so re-running failed once
-- logregr_cg_step existed (there is no CREATE OR REPLACE AGGREGATE).
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logreg_cg_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logregr_cg_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]);
/**
 * @internal
 * @brief Perform one iteration of the conjugate-gradient method for computing
 *     logistic regression
 */
CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (
    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.logregr_cg_step_transition,
    -- On Greenplum, allow segment-local partial states to be merged in parallel
    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.logregr_cg_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.logregr_cg_step_final,
    INITCOND='{0,0,0,0,0,0}'
);
-- Drop the aggregate under its legacy name (without the trailing 'r' in
-- 'logregr'; presumably left over from an earlier release -- confirm) AND
-- under its current name, so that this script can be re-run. The original
-- only dropped the legacy name, so re-running failed once
-- logregr_irls_step existed (there is no CREATE OR REPLACE AGGREGATE).
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logreg_irls_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logregr_irls_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]);
/**
 * @internal
 * @brief Perform one iteration of the iteratively-reweighted-least-squares
 *     method for computing logistic regression
 */
CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (
    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.logregr_irls_step_transition,
    -- On Greenplum, allow segment-local partial states to be merged in parallel
    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.logregr_irls_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.logregr_irls_step_final,
    INITCOND='{0,0,0}'
);
/**
 * @internal
 * @brief Distance between two transition states of the conjugate-gradient
 *     aggregate. NOTE(review): presumably evaluated by the Python driver as
 *     the convergence criterion -- confirm against the logistic module.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_cg_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
 * @internal
 * @brief Convert the final conjugate-gradient transition state into a
 *     logregr_result tuple. Invoked dynamically by logregr() below when the
 *     optimizer is 'cg'.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_cg_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.logregr_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
 * @internal
 * @brief Distance between two transition states of the IRLS aggregate.
 *     NOTE(review): presumably evaluated by the Python driver as the
 *     convergence criterion -- confirm against the logistic module.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_irls_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
 * @internal
 * @brief Convert the final IRLS transition state into a logregr_result tuple.
 *     Invoked dynamically by logregr() below when the optimizer is 'irls' or
 *     'newton'.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_irls_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.logregr_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- begin functions for logistic-regression coefficients
-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
-- Driver entry point: runs the iterative algorithm (implemented in Python, in
-- the regress.logistic module; see the PythonFunction macro) and returns the
-- number of the final iteration. The logregr() wrapper below uses that number
-- to look up the final state in the _madlib_iterative_alg table.
-- The /*+ DEFAULT ... */ annotations record the defaults that the Python
-- wrapper applies; they are not SQL-level defaults.
CREATE FUNCTION MADLIB_SCHEMA.compute_logregr(
"source" VARCHAR,
"depColumn" VARCHAR,
"indepColumn" VARCHAR,
"numIterations" INTEGER /*+ DEFAULT 20 */,
"optimizer" VARCHAR /*+ DEFAULT 'irls' */,
"precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
RETURNS INTEGER
AS $$PythonFunction(regress, logistic, compute_logregr)$$
LANGUAGE plpythonu VOLATILE;
/**
* @brief Compute logistic-regression coefficients and diagnostic statistics
*
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @param source Name of the source relation containing the training data
* @param depColumn Name of the dependent column (of type BOOLEAN)
* @param indepColumn Name of the independent column (of type DOUBLE
* PRECISION[])
* @param numIterations The maximum number of iterations
* @param optimizer The optimizer to use (either
* <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least
* squares or <tt>'cg'</tt> for conjugate gradient)
* @param precision The difference between log-likelihood values in successive
* iterations that should indicate convergence, or 0 indicating that
* log-likelihood values should be ignored
*
* @return A composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$
* - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$
* - <tt>odds_ratios FLOAT8[]</tt>: Array of odds ratios,
* \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT * FROM logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre>
* - Get array of coefficients \f$ \boldsymbol c \f$, log-likelihood
*   \f$ l(\boldsymbol c) \f$, and array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT coef, log_likelihood, p_values
*FROM logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
*
* @note This function starts an iterative algorithm. It is not an aggregate
* function. Source and column names have to be passed as strings (due to
* limitations of the SQL syntax).
*
* @internal
* @sa This function is a wrapper for logistic::compute_logregr(), which
* sets the default values.
*/
-- Full six-argument form of logregr (documented in the comment block above):
-- runs the iterative algorithm via compute_logregr() and then reads the
-- result tuple of the final iteration back out of _madlib_iterative_alg
-- (a table presumably created by the Python driver -- confirm).
CREATE FUNCTION MADLIB_SCHEMA.logregr(
"source" VARCHAR,
"depColumn" VARCHAR,
"indepColumn" VARCHAR,
"numIterations" INTEGER /*+ DEFAULT 20 */,
"optimizer" VARCHAR /*+ DEFAULT 'irls' */,
"precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
RETURNS MADLIB_SCHEMA.logregr_result AS $$
DECLARE
theIteration INTEGER;  -- number of the final iteration, as reported by the driver
fnName VARCHAR;        -- unqualified name of the result-extraction function
theResult MADLIB_SCHEMA.logregr_result;
BEGIN
theIteration := (
SELECT MADLIB_SCHEMA.compute_logregr($1, $2, $3, $4, $5, $6)
);
-- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using
-- EXECUTE) in the following
-- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
-- function in a subquery
-- Pick the result extractor matching the optimizer. (The unquoted reference
-- "optimizer" folds to lower case and thus resolves to the quoted parameter
-- declared above.) NOTE(review): any value other than 'irls'/'newton' falls
-- through to the CG extractor; presumably invalid optimizer names are
-- rejected earlier, inside compute_logregr -- confirm.
IF optimizer = 'irls' OR optimizer = 'newton' THEN
fnName := 'internal_logregr_irls_result';
ELSE
fnName := 'internal_logregr_cg_result';
END IF;
-- Only internally-generated values (fnName, theIteration INTEGER) are
-- concatenated into the dynamic SQL, so no injection risk from user input.
EXECUTE
$sql$
SELECT (result).*
FROM (
SELECT
MADLIB_SCHEMA.$sql$ || fnName || $sql$(_madlib_state) AS result
FROM _madlib_iterative_alg
WHERE _madlib_iteration = $sql$ || theIteration || $sql$
) subq
$sql$
INTO theResult;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
 * @brief Logistic regression with default number of iterations (20), default
 *     optimizer ('irls'), and default precision (0.0001).
 *
 * See the six-argument form of this function for the full documentation.
 */
CREATE FUNCTION MADLIB_SCHEMA.logregr(
    "source" VARCHAR,
    "depColumn" VARCHAR,
    "indepColumn" VARCHAR)
RETURNS MADLIB_SCHEMA.logregr_result
AS $body$
    SELECT MADLIB_SCHEMA.logregr($1, $2, $3, 20, 'irls', 0.0001);
$body$
LANGUAGE sql VOLATILE;
/**
 * @brief Logistic regression with default optimizer ('irls') and default
 *     precision (0.0001).
 *
 * See the six-argument form of this function for the full documentation.
 */
CREATE FUNCTION MADLIB_SCHEMA.logregr(
    "source" VARCHAR,
    "depColumn" VARCHAR,
    "indepColumn" VARCHAR,
    "numIterations" INTEGER)
RETURNS MADLIB_SCHEMA.logregr_result
AS $body$
    SELECT MADLIB_SCHEMA.logregr($1, $2, $3, $4, 'irls', 0.0001);
$body$
LANGUAGE sql VOLATILE;
/**
 * @brief Logistic regression with default precision (0.0001).
 *
 * See the six-argument form of this function for the full documentation.
 */
CREATE FUNCTION MADLIB_SCHEMA.logregr(
    "source" VARCHAR,
    "depColumn" VARCHAR,
    "indepColumn" VARCHAR,
    "numIterations" INTEGER,
    "optimizer" VARCHAR)
RETURNS MADLIB_SCHEMA.logregr_result
AS $body$
    SELECT MADLIB_SCHEMA.logregr($1, $2, $3, $4, $5, 0.0001);
$body$
LANGUAGE sql VOLATILE;