/* ----------------------------------------------------------------------- *//**
*
* @file multilogistic.sql_in
*
* @brief SQL functions for multinomial logistic regression
* @date July 2012
*
* @sa For a brief introduction to multinomial logistic regression, see the
* module description \ref grp_mlogreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_mlogreg
@about
Multinomial logistic regression is a widely used regression analysis tool that models the outcomes of a categorical dependent
random variable (denoted \f$ Y \in \{ 0,1,2, \ldots, k \} \f$). The model assumes that the conditional mean of the
dependent categorical variable is the logistic function of an affine combination of independent
variables (usually denoted \f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x)
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$ and where
\f$ \sigma(x) = \frac{1}{1 + \exp(-x)} \f$ is the logistic function. Multinomial Logistic
regression finds the vector of coefficients \f$ \boldsymbol c \f$ that maximizes
the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \{ 0,1 \}^{n \times k} \f$ denote the matrix of observed dependent
variables, with \f$ n \f$ rows and \f$ k \f$ columns, containing the observed values of the
dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
columns and \f$ n \f$ rows, containing all observed vectors of independent
variables \f$ \boldsymbol x_i \f$ as rows.
By definition,
\f[
P[Y = y_i | \boldsymbol x_i]
= \sigma((-1)^{y_i} \cdot \boldsymbol c^T \boldsymbol x_i)
\,.
\f]
Maximizing the likelihood
\f$ \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol x_i) \f$
is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log \Pr(Y = y_i \mid \boldsymbol x_i) \f$, which simplifies to
\f[
l(\boldsymbol c) =
-\sum_{i=1}^n \log(1 + \exp((-1)^{y_i}
\cdot \boldsymbol c^T \boldsymbol x_i))
\,.
\f]
The Hessian of this objective is \f$ H = -X^T A X \f$ where
\f$ A = \text{diag}(a_1, \dots, a_n) \f$ is the diagonal matrix with
\f$
a_i = \sigma(\boldsymbol c^T \boldsymbol x_i)
\cdot
\sigma(-\boldsymbol c^T \boldsymbol x_i)
\,.
\f$
Since \f$ H \f$ is non-positive definite, \f$ l(\boldsymbol c) \f$ is concave, so maximizing it is a convex optimization problem.
There are many techniques for solving convex optimization problems. Currently,
logistic regression in MADlib can use:
- Iteratively Reweighted Least Squares
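For the binary formulation sketched above, each IRLS (Newton) iteration can be written as the
following update (a sketch of the standard Newton step, with
\f$ \boldsymbol p = (\sigma(\boldsymbol c^T \boldsymbol x_1), \ldots, \sigma(\boldsymbol c^T \boldsymbol x_n))^T \f$,
\f$ \boldsymbol y = (y_1, \ldots, y_n)^T \f$, and \f$ A \f$ as defined for the Hessian,
all evaluated at the current iterate \f$ \boldsymbol c^{(m)} \f$):
\f[
    \boldsymbol c^{(m+1)} = \boldsymbol c^{(m)}
        + (X^T A X)^{-1} X^T (\boldsymbol y - \boldsymbol p)
\,.
\f]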
We estimate the standard error for coefficient \f$ i \f$ as
\f[
    \mathit{se}(c_i) = \sqrt{\left( (X^T A X)^{-1} \right)_{ii}}
\,.
\f]
The Wald z-statistic is
\f[
z_i = \frac{c_i}{\mathit{se}(c_i)}
\,.
\f]
The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under
the assumptions inherent in the Wald test) of seeing a value at least as extreme
as the one observed, provided that the null hypothesis (\f$ c_i = 0 \f$) is
true. Letting \f$ F \f$ denote the cumulative distribution function of a standard
normal distribution, the Wald \f$ p \f$-value for coefficient \f$ i \f$ is
therefore
\f[
p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| ))
\f]
where \f$ Z \f$ is a standard normally distributed random variable.
The odds ratio for coefficient \f$ i \f$ is estimated as \f$ \exp(c_i) \f$.
The condition number is computed as \f$ \kappa(X^T A X) \f$ during the iteration
immediately <em>preceding</em> convergence (i.e., \f$ A \f$ is computed using
the coefficients of the previous iteration). A large condition number (say, more
than 1000) indicates the presence of significant multicollinearity.
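For example (purely illustrative, reusing the <em>data</em> table from the example
below), this threshold can be checked directly from the output:
<pre>SELECT condition_no,
       condition_no > 1000 AS possible_multicollinearity
FROM mlogregr('data', 'val', 'r1');</pre>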
The multinomial logistic regression uses a default reference category of zero,
and the regression coefficients in the output are in the order described below.
For a problem with
\f$ K \f$ independent variables \f$ (1, \ldots, K) \f$ and \f$ J \f$ categories
\f$ (0, \ldots, J-1) \f$, let \f$ m_{k,j} \f$ denote the coefficient for
independent variable \f$ k \f$ and category \f$ j \f$. The output is
\f$ m_{1,0}, m_{1,1}, \ldots, m_{1,J-1}, m_{2,0}, m_{2,1}, \ldots, m_{2,J-1}, \ldots, m_{K,J-1} \f$.
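As a concrete illustration of the ordering described above (assuming, purely as an
example, \f$ K = 2 \f$ independent variables and \f$ J = 3 \f$ categories), the
<tt>coef</tt> array would be laid out as
\f[
    (m_{1,0},\; m_{1,1},\; m_{1,2},\; m_{2,0},\; m_{2,1},\; m_{2,2})
\,.
\f]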
This order is NOT consistent with the ordering used by the multinomial
marginal-effects function <em>marginal_mlogregr</em>. This is deliberate,
because the interfaces of all multinomial regressions (robust, clustered, ...)
will eventually be moved to match the interface used by <em>marginal_mlogregr</em>.
@input
The training data is expected to be of the following form:\n
<pre>{TABLE|VIEW} <em>sourceName</em> (
...
<em>dependentVariable</em> INTEGER,
<em>independentVariables</em> FLOAT8[],
...
)</pre>
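For instance, a minimal source table in this format (using the illustrative
<em>data</em>/<em>val</em>/<em>r1</em> names from the example below) could be
created as:
<pre>CREATE TABLE data (
    r1  FLOAT8[],  -- independent variables; include a constant 1 for the intercept
    val INTEGER    -- dependent variable, encoded as 0, 1, ..., numCategories-1
);</pre>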
@usage
- The number of independent variables cannot exceed 65535.
- The reference category must be in the range [0, numCategories - 1]. The default
reference category is zero.
- Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
statistics:\n
<pre>SELECT * FROM \ref mlogregr(
'<em>sourceName</em>',
'<em>dependentVariable</em>',
'<em>independentVariables</em>' [,
<em>numberOfIterations</em> [,
'<em>optimizer</em>' [,
<em>precision</em> [,
<em>ref_category</em>]]]]
);</pre>
Output:
<pre>
ref_category | coef | log_likelihood | std_err | z_stats | p_values | odds_ratios | condition_no | num_iterations
-------------+------+----------------+---------+---------+----------+-------------+--------------+---------------
...
</pre>
- Get vector of coefficients \f$ \boldsymbol c \f$:\n
<pre>SELECT (\ref mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>')).coef;</pre>
- Get a subset of the output columns, e.g., only the array of coefficients
\f$ \boldsymbol c \f$, the log-likelihood
\f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
<pre>SELECT coef, log_likelihood, p_values
FROM \ref mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>', '<em>independentVariables</em>');</pre>
Note that the categories are encoded as integers with values in {0, 1, 2, ..., numCategories - 1}.
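A quick sanity check of this encoding on the source table (a sketch, again using
the illustrative <em>data</em>/<em>val</em> names) is:
<pre>SELECT min(val), max(val), count(DISTINCT val)
FROM data;
-- expect min = 0 and max = count(DISTINCT val) - 1</pre>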
@examp
-# Create the sample data set:
@verbatim
sql> SELECT * FROM data;
r1 | val
---------------------------------------------+-----
{1,3.01789340097457,0.454183579888195} | 1
{1,-2.59380532894284,0.602678326424211} | 0
{1,-1.30643094424158,0.151587064377964} | 1
{1,3.60722299199551,0.963550757616758} | 1
{1,-1.52197745628655,0.0782248834148049} | 1
{1,-4.8746574902907,0.345104880165309} | 0
...
@endverbatim
-# Run the multi-logistic regression function:
@verbatim
sql> \x on
Expanded display is on.
sql> SELECT * FROM mlogregr('data', 'val', 'r1', 100, 'irls', 0.001, 0);
-[ RECORD 1 ]--+--------------------------------------------------------------
ref_category   | 0
coef           | {5.59049410898112,2.11077546770772,-0.237276684606453}
log_likelihood | -467.214718489873
std_err        | {0.318943457652178,0.101518723785383,0.294509929481773}
z_stats        | {17.5281667482197,20.7919819024719,-0.805666162169712}
p_values       | {8.73403463417837e-69,5.11539430631541e-96,0.420435365338518}
odds_ratios    | {267.867942976278,8.2546400100702,0.788773016471171}
condition_no   | 179.186118573205
num_iterations | 9
@endverbatim
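-# (Optional) Cross-check the reported odds ratios against \f$ \exp(c_i) \f$.
This is only a sketch; it reuses the call from the previous step and assumes
that <tt>unnest()</tt> is available in your database version:
@verbatim
sql> SELECT exp(c) AS odds_ratio_check
     FROM (
         SELECT unnest(coef) AS c
         FROM mlogregr('data', 'val', 'r1', 100, 'irls', 0.001, 0)
     ) t;
@endverbatim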
@literature
A collection of nice write-ups, with valuable pointers into
further literature:
[1] Annette J. Dobson: An Introduction to Generalized Linear Models, Second Edition. Nov 2001
[2] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf
[3] Srikrishna Sridhar, Mark Wellons, Caleb Welton: Multilogistic Regression:
Notes and References, Jul 12 2012, http://www.cs.wisc.edu/~srikris/mlogit.pdf
[4] Scott A. Czepiel: Maximum Likelihood Estimation
of Logistic Regression Models: Theory and Implementation,
Retrieved Jul 12 2012, http://czep.net/stat/mlelr.pdf
@sa File multilogistic.sql_in (documenting the SQL functions)
@internal
@sa Namespace multilogistic (documenting the driver/outer loop implemented in
Python), Namespace
\ref madlib::modules::regress documenting the implementation in C++
@endinternal
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.mlogregr_result;
CREATE TYPE MADLIB_SCHEMA.mlogregr_result AS
(
ref_category INTEGER,
coef DOUBLE PRECISION[],
log_likelihood DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
odds_ratios DOUBLE PRECISION[],
condition_no DOUBLE PRECISION,
num_iterations INTEGER
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_transition
(
state DOUBLE PRECISION[],
y INTEGER,
num_categories INTEGER,
ref_category INTEGER,
x DOUBLE PRECISION[],
prev_state DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_merge_states
(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__mlogregr_irls_step_final
(
state DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @internal
* @brief Perform one iteration of the iteratively-reweighted-least-squares
* method for computing multinomial logistic regression
*/
CREATE AGGREGATE MADLIB_SCHEMA.__mlogregr_irls_step(
/*+ y */ INTEGER,
/*+ numCategories */ INTEGER,
/*+ ref_category */ INTEGER,
/*+ x */ DOUBLE PRECISION[],
/*+ previous_state */ DOUBLE PRECISION[]) (
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.__mlogregr_irls_step_transition,
m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.__mlogregr_irls_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__mlogregr_irls_step_final,
INITCOND='{0,0,0,0,0}'
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__internal_mlogregr_irls_step_distance(
/*+ state1 */ DOUBLE PRECISION[],
/*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__internal_mlogregr_irls_result(
/*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.mlogregr_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE FUNCTION MADLIB_SCHEMA.__compute_mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
numcategories INTEGER,
maxnumiterations INTEGER,
optimizer VARCHAR,
"precision" DOUBLE PRECISION,
ref_category INTEGER
)
RETURNS INTEGER
AS $$
PythonFunction(regress, multilogistic, compute_mlogregr)
$$ LANGUAGE plpythonu VOLATILE STRICT;
/**
* @brief Compute multinomial logistic-regression coefficients and diagnostic statistics
*
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @param source Name of the source relation containing the training data
* @param depvar Name of the dependent column (of type INTEGER, with values in [0, numcategories))
* @param indepvar Name of the independent column (of type DOUBLE PRECISION[])
* @param maxnumiterations The maximum number of iterations
* @param optimizer The optimizer to use (
* <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least
* squares)
* @param precision The difference between log-likelihood values in successive
* iterations that should indicate convergence. Note that a non-positive
* value here disables the convergence criterion, and execution will only
* stop after <tt>maxnumiterations</tt> iterations.
* @param ref_category The reference category specified by the user
*
* @return A composite value:
* - <tt>ref_category INTEGER</tt> - Reference category
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$
* - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$
* - <tt>odds_ratios FLOAT8[]</tt> - Array of odds ratios,
* \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$
* - <tt>condition_no FLOAT8</tt> - The condition number of matrix
* \f$ X^T A X \f$ during the iteration immediately <em>preceding</em>
* convergence (i.e., \f$ A \f$ is computed using the coefficients of the
* previous iteration)
* - <tt>num_iterations INTEGER</tt> - The number of iterations before the
* algorithm terminated
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT * FROM mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>');</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>')).coef;</pre>
* - Get a subset of the output columns, e.g., only the array of coefficients
* \f$ \boldsymbol c \f$, the log-likelihood
* \f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT coef, log_likelihood, p_values
* FROM mlogregr('<em>sourceName</em>', '<em>dependentVariable</em>',
*     '<em>independentVariables</em>');</pre>
*
* @note This function starts an iterative algorithm. It is not an aggregate
* function. Source and column names have to be passed as strings (due to
* limitations of the SQL syntax).
*
* @internal
* @sa This function is a wrapper for multilogistic::__compute_mlogregr(), which
* sets the default values.
*/
CREATE FUNCTION MADLIB_SCHEMA.mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER /*+ DEFAULT 20 */,
optimizer VARCHAR /*+ DEFAULT 'irls' */,
"precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */,
ref_category INTEGER
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS $$
DECLARE
observed_count INTEGER;
theIteration INTEGER;
fnName VARCHAR;
theResult MADLIB_SCHEMA.mlogregr_result;
numcategories INTEGER;
min_category INTEGER;
max_category INTEGER;
BEGIN
IF (source IS NULL OR trim(source) = '') THEN
RAISE EXCEPTION 'Invalid source table given';
END IF;
IF (depvar IS NULL OR trim(depvar) = '') THEN
RAISE EXCEPTION 'Invalid depvar given';
END IF;
IF (indepvar IS NULL OR trim(indepvar) = '') THEN
RAISE EXCEPTION 'Invalid indepvar given';
END IF;
IF (maxnumiterations IS NULL OR maxnumiterations < 1) THEN
RAISE EXCEPTION 'The maximum number of iterations must be positive';
END IF;
IF (optimizer IS NULL OR trim(optimizer) = '') THEN
RAISE EXCEPTION 'Invalid optimizer given';
END IF;
IF (precision IS NULL) THEN
RAISE EXCEPTION 'Invalid precision given.';
END IF;
IF (ref_category IS NULL OR ref_category < 0) THEN
RAISE EXCEPTION 'Invalid ref_category given.';
END IF;
IF (SELECT atttypid::regtype <> 'INTEGER'::regtype
FROM pg_attribute
WHERE attrelid = source::regclass AND attname = depvar) THEN
RAISE EXCEPTION 'The dependent variable column should be of type INTEGER';
END IF;
EXECUTE $sql$ SELECT count(DISTINCT $sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO observed_count;
numcategories := observed_count;
EXECUTE $sql$ SELECT max($sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO max_category;
EXECUTE $sql$ SELECT min($sql$ || depvar || $sql$ )
FROM $sql$ || textin(regclassout(source))
INTO min_category;
IF max_category != numcategories - 1 OR min_category != 0 THEN
RAISE EXCEPTION 'The value of the dependent variable should be in the
range of [0, %)', numcategories;
END IF;
IF ref_category > numcategories - 1 OR ref_category < 0 THEN
RAISE EXCEPTION 'The value of the reference category should be in the
range of [0, %)', numcategories;
END IF;
IF optimizer = 'irls' OR optimizer = 'newton' THEN
fnName := '__internal_mlogregr_irls_result';
ELSE
RAISE EXCEPTION 'Unknown optimizer (''%''). Must be "newton" or "irls"', optimizer;
END IF;
theIteration := (
SELECT MADLIB_SCHEMA.__compute_mlogregr(
$1, $2, $3, numcategories, $4, $5, $6, $7)
);
-- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using
-- EXECUTE) in the following
-- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
-- function in a subquery
EXECUTE
$sql$
SELECT (result).*
FROM (
SELECT
MADLIB_SCHEMA.$sql$ || fnName || $sql$(_madlib_state) AS result
FROM _madlib_iterative_alg
WHERE _madlib_iteration = $sql$ || theIteration || $sql$
) subq
$sql$
INTO theResult;
-- The number of iterations is not updated in the C++ code. We do it here.
IF NOT (theResult IS NULL) THEN
theResult.num_iterations = theIteration;
END IF;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr
(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, 20, 'irls', 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, $4, 'irls', 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.mlogregr(
source VARCHAR,
depvar VARCHAR,
indepvar VARCHAR,
maxnumiterations INTEGER,
optimizer VARCHAR
)
RETURNS MADLIB_SCHEMA.mlogregr_result AS
$$
SELECT MADLIB_SCHEMA.mlogregr($1, $2, $3, $4, $5, 0.0001, 0);
$$ LANGUAGE sql VOLATILE;
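-- Illustrative invocations of the overloads defined above (a sketch; assumes
-- the 'data' table used in the documentation example and that MADLIB_SCHEMA
-- resolves to the installed schema):
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1');
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50);
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50, 'irls');
--   SELECT * FROM MADLIB_SCHEMA.mlogregr('data', 'val', 'r1', 50, 'irls', 0.0001, 0);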