| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file logistic.sql_in |
| * |
| * @brief SQL functions for logistic regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to logistic regression, see the |
| * module description \ref grp_logreg. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_logreg |
| |
| @about |
| |
| (Binomial) Logistic regression refers to a stochastic model in which the |
| conditional mean of the dependent dichotomous variable (usually denoted |
| \f$ Y \in \{ 0,1 \} \f$) is the logistic function of an affine function of the |
| vector of independent variables (usually denoted \f$ \boldsymbol x \f$). That |
| is, |
| \f[ |
| E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x) |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$ and where |
| \f$ \sigma(x) = \frac{1}{1 + \exp(-x)} \f$ is the logistic function. Logistic |
| regression finds the vector of coefficients \f$ \boldsymbol c \f$ that maximizes |
| the likelihood of the observations. |
| |
| Let |
| - \f$ \boldsymbol y \in \{ 0,1 \}^n \f$ denote the vector of observed dependent |
| variables, with \f$ n \f$ rows, containing the observed values of the |
| dependent variable, |
| - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ |
| columns and \f$ n \f$ rows, containing all observed vectors of independent |
| variables \f$ \boldsymbol x_i \f$ as rows. |
| |
| By definition, |
| \f[ |
| P[Y = y_i | \boldsymbol x_i] |
| = \sigma((-1)^{y_i} \cdot \boldsymbol c^T \boldsymbol x_i) |
| \,. |
| \f] |
| Maximizing the likelihood |
| \f$ \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol x_i) \f$ |
| is equivalent to maximizing the log-likelihood |
| \f$ \sum_{i=1}^n \log \Pr(Y = y_i \mid \boldsymbol x_i) \f$, which simplifies to |
| \f[ |
| l(\boldsymbol c) = |
| -\sum_{i=1}^n \log(1 + \exp((-1)^{y_i} |
| \cdot \boldsymbol c^T \boldsymbol x_i)) |
| \,. |
| \f] |
| The Hessian of this objective is \f$ H = -X^T A X \f$ where |
| \f$ A = \text{diag}(a_1, \dots, a_n) \f$ is the diagonal matrix with |
| \f$ |
| a_i = \sigma(\boldsymbol c^T \boldsymbol x) |
| \cdot |
| \sigma(-\boldsymbol c^T \boldsymbol x) |
| \,. |
| \f$ |
| Since \f$ H \f$ is non-positive definite, \f$ l(\boldsymbol c) \f$ is convex. |
| There are many techniques for solving convex optimization problems. Currently, |
| logistic regression in MADlib can use one of two algorithms: |
| - Iteratively Reweighted Least Squares |
| - A conjugate-gradient approach, also known as Fletcher-Reeves method in the |
| literature, where we use the Hestenes-Stiefel rule for calculating the step |
| size. |
| |
| We estimate the standard error for coefficient \f$ i \f$ as |
| \f[ |
| \mathit{se}(c_i) = \left( (X^T A X)^{-1} \right)_{ii} |
| \,. |
| \f] |
| The Wald z-statistic is |
| \f[ |
| z_i = \frac{c_i}{\mathit{se}(c_i)} |
| \,. |
| \f] |
| |
| The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under |
| the assumptions inherent in the Wald test) that the null hypothesis |
| (\f$ c_i = 0 \f$) is false, i.e., the probability that \f$ c_i \f$ differs |
| significantly from 0. Letting \f$ F \f$ denote the cumulative density |
| function of a standard normal distribution, the Wald \f$ p \f$-value for |
| coefficient \f$ i \f$ is therefore |
| \f[ |
| p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| )) |
| \f] |
| where \f$ Z \f$ is a standard normally distributed random variable. |
| |
| The odds ratio for coefficient \f$ i \f$ is estimated as \f$ \exp(c_i) \f$. |
| |
| |
| @input |
| |
| The training data is expected to be of the following form:\n |
| <pre>{TABLE|VIEW} <em>sourceName</em> ( |
| ... |
| <em>dependentVariable</em> BOOLEAN, |
| <em>independentVariables</em> FLOAT8[], |
| ... |
| )</pre> |
| |
| @usage |
| - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic |
| statistics:\n |
| <pre>SELECT * FROM \ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre> |
| Output: |
| <pre>coef | log_likelihood | std_err | z_stats | p_values | odds_ratios |
| -----+----------------+---------+---------+----------+------------ |
| ... |
| </pre> |
| - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| <pre>SELECT (\ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre> |
| - Get array of coefficients \f$ \boldsymbol c \f$, log-likelihood of |
| determination \f$ l(\boldsymbol c) \f$, and array of p-values \f$ \boldsymbol p \f$: |
| <pre>SELECT coef, log_likelihood, p_values |
| FROM \ref logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre> |
| |
| @examp |
| |
| -# Create the sample data set: |
| @verbatim |
| # SELECT * FROM data; |
| r1 | val |
| ---------------------------------------------+----- |
| {1,-0.537208849564195,0.0507612079381943} | t |
| {1,-1.83960389345884,0.0875045731663704} | t |
| {1,-4.28730633109808,0.839790890924633} | t |
| {1,2.4249863717705,0.499282206874341} | t |
| {1,-4.41880372352898,0.883616517763585} | t |
| {1,4.16272420901805,0.598018785007298} | t |
| ... |
| @endverbatim |
| -# Run the logistic regression function: |
| @verbatim |
| # SELECT * FROM madlib.logregr('data', 'val', 'r1', 100, 'irls', 0.001); |
| |
| coef | log_likelihood | std_err | z_stats | p_values | odds_ratios |
| --------------------------------------------------------+-------------------+----------------------------------------------------------+-------------------------------------------------------+---------------------------------------------------------------+------------------------------------------------------ |
| {4.93532185789865,0.469393180445221,0.432089528467682} | -173.514819452336 | {0.407205759965291,0.0881607217787946,0.591853189786052} | {12.1199706465825,5.32428921830951,0.730062008492135} | {8.27887853333452e-34,1.01348545192981e-07,0.465352282469966} | {139.117911525733,1.59902357997671,1.54047302517705} |
| (1 row) |
| @endverbatim |
| |
| @literature |
| |
| A somewhat random selection of nice write-ups, with valuable pointers into |
| further literature: |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf |
| |
| [2] Thomas P. Minka: A comparison of numerical optimizers for logistic |
| regression, 2003 (revised Mar 26, 2007), |
| http://research.microsoft.com/en-us/um/people/minka/papers/logreg/minka-logreg.pdf |
| |
| [3] Paul Komarek, Andrew W. Moore: Making Logistic Regression A Core Data Mining |
| Tool With TR-IRLS, IEEE International Conference on Data Mining 2005, |
| pp. 685-688, http://komarix.org/ac/papers/tr-irls.short.pdf |
| |
| @sa File logistic.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa Namespace logistic (documenting the driver/outer loop implemented in |
| Python), Classes |
| \ref madlib::modules::regress::LogisticRegressionIRLS "LogisticRegressionIRLS" |
| and |
| \ref madlib::modules::regress::LogisticRegressionCG "LogisticRegressionCG" |
| documenting the implementation in C++ |
| @endinternal |
| |
| */ |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.logregr_result; |
| CREATE TYPE MADLIB_SCHEMA.logregr_result AS ( |
| coef DOUBLE PRECISION[], |
| log_likelihood DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| z_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| odds_ratios DOUBLE PRECISION[] |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_transition( |
| DOUBLE PRECISION[], |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_transition( |
| DOUBLE PRECISION[], |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_merge_states( |
| state1 DOUBLE PRECISION[], |
| state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_merge_states( |
| state1 DOUBLE PRECISION[], |
| state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_cg_step_final( |
| state DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_irls_step_final( |
| state DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logreg_cg_step( |
| /*+ y */ BOOLEAN, |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ previous_state" */ DOUBLE PRECISION[]); |
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the conjugate-gradient method for computing |
| * logistic regression |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step( |
| /*+ y */ BOOLEAN, |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ previous_state */ DOUBLE PRECISION[]) ( |
| |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.logregr_cg_step_transition, |
| m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.logregr_cg_step_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.logregr_cg_step_final, |
| INITCOND='{0,0,0,0,0,0}' |
| ); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.logreg_irls_step( |
| /*+ y */ BOOLEAN, |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ previous_state */ DOUBLE PRECISION[]); |
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the iteratively-reweighted-least-squares |
| * method for computing linear regression |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step( |
| /*+ y */ BOOLEAN, |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ previous_state */ DOUBLE PRECISION[]) ( |
| |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.logregr_irls_step_transition, |
| m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.logregr_irls_step_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.logregr_irls_step_final, |
| INITCOND='{0,0,0}' |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_cg_step_distance( |
| /*+ state1 */ DOUBLE PRECISION[], |
| /*+ state2 */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION AS |
| 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_cg_result( |
| /*+ state */ DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.logregr_result AS |
| 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_irls_step_distance( |
| /*+ state1 */ DOUBLE PRECISION[], |
| /*+ state2 */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION AS |
| 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_logregr_irls_result( |
| /*+ state */ DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.logregr_result AS |
| 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| |
| -- begin functions for logistic-regression coefficients |
| -- We only need to document the last one (unfortunately, in Greenplum we have to |
| -- use function overloading instead of default arguments). |
| CREATE FUNCTION MADLIB_SCHEMA.compute_logregr( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER /*+ DEFAULT 20 */, |
| "optimizer" VARCHAR /*+ DEFAULT 'irls' */, |
| "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) |
| RETURNS INTEGER |
| AS $$PythonFunction(regress, logistic, compute_logregr)$$ |
| LANGUAGE plpythonu VOLATILE; |
| |
| /** |
| * @brief Compute logistic-regression coefficients and diagnostic statistics |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @param source Name of the source relation containing the training data |
| * @param depColumn Name of the dependent column (of type BOOLEAN) |
| * @param indepColumn Name of the independent column (of type DOUBLE |
| * PRECISION[]) |
| * @param numIterations The maximum number of iterations |
| * @param optimizer The optimizer to use (either |
| * <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least |
| * squares or <tt>'cg'</tt> for conjugent gradient) |
| * @param precision The difference between log-likelihood values in successive |
| * iterations that should indicate convergence, or 0 indicating that |
| * log-likelihood values should be ignored |
| * |
| * @return A composite value: |
| * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$ |
| * - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$ |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard errors, |
| * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$ |
| * - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$ |
| * - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$ |
| * - <tt>odds_ratios FLOAT8[]</tt>: Array of odds ratios, |
| * \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$ |
| * |
| * @usage |
| * - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic |
| * statistics:\n |
| * <pre>SELECT * FROM logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre> |
| * - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| * <pre>SELECT (logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre> |
| * - Get array of coefficients \f$ \boldsymbol c \f$, log-likelihood of |
| * determination \f$ l(\boldsymbol c) \f$, and array of p-values \f$ \boldsymbol p \f$: |
| * <pre>SELECT coef, log_likelihood, p_values |
| *FROM logregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre> |
| * |
| * @note This function starts an iterative algorithm. It is not an aggregate |
| * function. Source and column names have to be passed as strings (due to |
| * limitations of the SQL syntax). |
| * |
| * @internal |
| * @sa This function is a wrapper for logistic::compute_logregr(), which |
| * sets the default values. |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.logregr( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER /*+ DEFAULT 20 */, |
| "optimizer" VARCHAR /*+ DEFAULT 'irls' */, |
| "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) |
| RETURNS MADLIB_SCHEMA.logregr_result AS $$ |
| DECLARE |
| theIteration INTEGER; |
| fnName VARCHAR; |
| theResult MADLIB_SCHEMA.logregr_result; |
| BEGIN |
| theIteration := ( |
| SELECT MADLIB_SCHEMA.compute_logregr($1, $2, $3, $4, $5, $6) |
| ); |
| -- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using |
| -- EXECUTE) in the following |
| -- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning |
| -- function in a subquery |
| IF optimizer = 'irls' OR optimizer = 'newton' THEN |
| fnName := 'internal_logregr_irls_result'; |
| ELSE |
| fnName := 'internal_logregr_cg_result'; |
| END IF; |
| EXECUTE |
| $sql$ |
| SELECT (result).* |
| FROM ( |
| SELECT |
| MADLIB_SCHEMA.$sql$ || fnName || $sql$(_madlib_state) AS result |
| FROM _madlib_iterative_alg |
| WHERE _madlib_iteration = $sql$ || theIteration || $sql$ |
| ) subq |
| $sql$ |
| INTO theResult; |
| RETURN theResult; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR) |
| RETURNS MADLIB_SCHEMA.logregr_result AS |
| $$SELECT MADLIB_SCHEMA.logregr($1, $2, $3, 20, 'irls', 0.0001);$$ |
| LANGUAGE sql VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER) |
| RETURNS MADLIB_SCHEMA.logregr_result AS |
| $$SELECT MADLIB_SCHEMA.logregr($1, $2, $3, $4, 'irls', 0.0001);$$ |
| LANGUAGE sql VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER, |
| "optimizer" VARCHAR) |
| RETURNS MADLIB_SCHEMA.logregr_result AS |
| $$SELECT MADLIB_SCHEMA.logregr($1, $2, $3, $4, $5, 0.0001);$$ |
| LANGUAGE sql VOLATILE; |