| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file regression.sql_in |
| * |
| * @brief SQL functions for multi-linear and logistic regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to multi-linear regression, see the module |
| * description \ref grp_linreg. Likewise, for logistic regression, see the |
| * module description \ref grp_logreg. |
| * |
| *//* -------------------------------------------------------------------------- |
| * |
 * This file is preprocessed with m4. Macro expansion can be turned off by
| * enclosing text in <nom4> and </nom4>. |
| */ |
| changequote(`<nom4>', `</nom4>') |
| |
| /** |
| @addtogroup grp_linreg |
| |
| @about |
| |
| Linear regression refers to a stochastic model, in which the conditional mean |
| of the dependent variable (usually denoted $y$) is an affine function of the |
| vector of independent variables (usually denoted \f$ \boldsymbol x \f$): |
| \f[ |
| E[y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$. |
| |
| We fit the model with the ordinary-least-squares method. That is, the vector of |
| regression coefficients \f$ \boldsymbol c \f$ is estimated as: |
| \f[ |
| \boldsymbol c = (X^T X)^+ X^T \boldsymbol y = X^+ \boldsymbol y |
| \f] |
| where |
| - $X$ is the design matrix with $k$ columns and $n$ rows, containing all |
| observed vectors of independent variables \f$ \boldsymbol x_i \f$ as rows |
| - $X^T$ denotes the transpose of $X$ |
- $X^+$ denotes the pseudo-inverse of $X$.
| Note: The identity \f$ X^+ = (X^T X)^+ X^T \f$ holds for all matrices $X$. A |
| proof can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Proofs_involving_the_Moore%2DPenrose_pseudoinverse |
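
For the implementation, it is worth noting (as a sketch only, not a description
of the exact internal state layout) that the normal-equations form requires only
per-row sums such as
\f[
    X^T X = \sum_{i=1}^n \boldsymbol x_i \boldsymbol x_i^T,
    \qquad
    X^T \boldsymbol y = \sum_{i=1}^n y_i \boldsymbol x_i,
\f]
which is why the SQL interface below is an aggregate: float8_mregr_accum()
accumulates such sums row by row, and float8_mregr_combine() merges partial
sums computed on different segments.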
| |
| Computing the <b>total sum of squares</b> $TSS$, the <b>explained |
| sum of squares</b> $ESS$ (also called the regression sum of |
| squares), and the <b>residual sum of squares</b> $RSS$ (also called sum of |
| squared residuals or sum of squared errors of prediction) is |
| done according to the following formulas: |
| \f[\begin{align*} |
| ESS & = \boldsymbol y^T X \boldsymbol c |
| - \frac{ \| y \|_1^2 }{n} \\ |
	TSS & = \sum_{i=1}^n y_i^2
| - \frac{ \| y \|_1^2 }{n} \\ |
| R^2 & = \frac{ESS}{TSS} |
| \end{align*}\f] |
| Note: The last equality follows from the definition |
| \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression |
| $TSS = RSS + ESS$. A proof of the latter can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Sum_of_squares |
| |
| We estimate the variance |
| \f$ Var[y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as |
| \f[ |
| \sigma^2 = \frac{RSS}{n - k} |
| \f] |
| and compute the t-statistic for coefficient $i$ as |
| \f[ |
| t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }} |
| \f] |
| |
The $p$-value for coefficient $i$ is the probability, under the null hypothesis
($c_i = 0$), of observing a test statistic at least as extreme as $t_i$. Letting
\f$ F_\nu \f$ denote the cumulative distribution function of the Student-t
distribution with \f$ \nu \f$ degrees of freedom, the $p$-value for coefficient
$i$ is therefore
| \f[ |
| p_i = P(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| )) |
| \f] |
where $T$ is a Student-t distributed random variable with \f$ n - k \f$ degrees
of freedom.
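
As an illustration of this formula (a hypothetical check, assuming the functions
are installed in schema <tt>madlib</tt>), the two-sided $p$-value for a given
t-statistic can be evaluated directly with student_t_cdf(), which is defined
later in this file. With the houses example below, \f$ n - k = 15 - 4 = 11 \f$,
so the following should approximately reproduce the reported $p$-value of the
last coefficient:

@verbatim
# select 2 * (1 - madlib.student_t_cdf(11, 3.61223));
@endverbatim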
| |
| @prereq |
| |
| Implemented in C for PostgreSQL/Greenplum. |
| |
| @usage |
| |
| -# The data set is expected to be of the following form:\n |
| <tt>{TABLE|VIEW} <em>sourceName</em> ([...] <em>dependentVariable</em> |
| DOUBLE PRECISION, <em>independentVariables</em> DOUBLE PRECISION[], |
| [...])</tt> |
| -# Run the linear regression by:\n |
| <tt>SELECT mregr_coef(<em>dependentVariable</em>, |
| <em>independentVariables</em>) FROM <em>sourceName</em></tt>\n |
| Note: In order to model an intercept, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. (See below for an example.) |
| \n |
| -# The coefficient of determination (also denoted $R^2$), the vector of |
| t-statistics, and the vector of p-values can be determined likewise by |
| mregr_r2(), mregr_tstats(), mregr_pvalues(). |
| |
| @examp |
| |
The following example is taken from
| http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf. |
| |
| @verbatim |
| # select * from houses; |
| id | tax | bedroom | bath | price | size | lot |
| ----+------+---------+------+--------+------+------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| (15 rows) |
| |
| # select mregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_coef |
| ------------------------------------ |
| {27923.4,-35524.8,2269.34,130.794} |
| (1 row) |
| |
| # select mregr_r2(price, array[1, bedroom, bath, size])::REAL from houses; |
| mregr_r2 |
| ---------- |
| 0.745374 |
| (1 row) |
| |
| # select mregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_tstats |
| -------------------------------------- |
| {0.495919,-1.41891,0.102183,3.61223} |
| (1 row) |
| |
| # select mregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_pvalues |
| ----------------------------------------- |
| {0.629711,0.183633,0.920451,0.00408159} |
| (1 row) |
| @endverbatim |
| |
| @sa file regression.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa file regress.c (documenting the implementation in C), function |
| float8_mregr_compute() (documenting the formulas used for coefficients, |
| $R^2$, t-statistics, and p-values, implemented in C) |
| @endinternal |
| |
| @literature |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf |
| |
| */ |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_accum(state DOUBLE PRECISION[], y DOUBLE PRECISION, x DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.float8_mregr_combine(state1 DOUBLE PRECISION[], state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| -- Final functions |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_coef(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_r2(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_tstats(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_pvalues(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| |
| /** |
| * @brief Compute multi-linear regression coefficients. |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return Array of coefficients, which has the same length as the array of |
| * independent variables. |
| * |
 * @examp <tt>SELECT mregr_coef(y, array[1, x1, x2]) FROM data;</tt>
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_coef( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
| * @brief Compute the coefficient of determination, $R^2$. |
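 *
 * @examp <tt>SELECT mregr_r2(y, array[1, x1, x2]) FROM data;</tt>
 *     (using the same hypothetical table as in the mregr_coef() example)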
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
 * @brief Compute the vector of t-statistics for every coefficient.
| * |
| * To include an intercept in the model, set one coordinate in the |
| * independentVariables array to 1. |
| * |
| * @param dependentVariable Dependent variable |
| * @param independentVariables Array of independent variables |
| * @return Array of t-statistics for each coefficient. The returned array has |
| * the same length as the array of independent variables. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
 * @brief Compute the vector of p-values for every coefficient.
| * |
| * @param dependentVariable Dependent variable |
| * @param independentVariables Array of independent variables |
| * @return Array of p-values for each coefficient. The returned array has |
| * the same length as the array of independent variables. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_pvalues( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
| * @brief Student-t cumulative distribution function. |
| * |
 * @param nu Degrees of freedom (>= 1).
| * @param x |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.student_t_cdf(nu INTEGER, x DOUBLE PRECISION) |
| RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| |
| /** |
| @addtogroup grp_logreg |
| |
| @about |
| |
| Logistic regression is used to estimate probabilities of a dependent binary |
| variable, by fitting a stochastic model. It is one of the most commonly used |
| tools for applied statistics and data mining [1]. |
| |
| Logistic regression assumes a generalized linear model: |
| \f[ |
| E[Y] = g^{-1}(\boldsymbol c^T X) |
| \f] |
| where: |
| - $Y$ is the dependent variable |
| - \f$\boldsymbol c^T X\f$ is the linear predictor |
| - \f$g(x) = \ln\left( \frac{x}{1-x} \right)\f$ is the link function, with |
| inverse \f$\sigma(x) := g^{-1}(x) = \frac{1}{1 + \exp(-x)} \f$ |
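
For example, once a coefficient vector has been estimated (e.g., with
logregr_coef() below), predicted probabilities follow from the inverse link
function. A minimal hand-written sketch, where the table <tt>data</tt>, the
feature columns <tt>x1, x2</tt>, and the coefficients <tt>c1, c2, c3</tt> (with
<tt>c1</tt> the intercept) are all hypothetical:

@verbatim
# select 1 / (1 + exp(-(c1 + c2*x1 + c3*x2))) as predicted_probability
#   from data;
@endverbatim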
| |
| For each training data point $i$, we have a vector of |
| features $x_i$ and an observed class $y_i$. For ease of notation, let $Z$ be a |
| dependent random variable such that $Z = -1$ if $Y = 0$ and $Z = 1$ if $Y = 1$, |
| i.e., \f$Z := 2(Y - \frac 12)\f$. By definition, |
\f$P[Z = z_i \mid X = x_i] = \sigma(z_i \cdot \boldsymbol c^T x_i)\f$.
| |
| Since logistic regression predicts probabilities, we can do maximum-likelihood |
| fitting: That is, we want the vector of regression coefficients |
| \f$\boldsymbol c\f$ to maximize |
| \f[ |
| \prod_{i=1}^n \sigma(z_i \cdot \boldsymbol c^T \boldsymbol x_i) |
| \f] |
| or, equivalently, to maximize the objective |
| \f[ |
| l(\boldsymbol c) = |
| -\sum_{i=1}^n \ln(1 + \exp(-z_i \cdot \boldsymbol c^T \boldsymbol x_i)) |
| \f] |
By looking at the Hessian, we can verify that \f$l(\boldsymbol c)\f$ is concave,
so maximizing it is a convex optimization problem.
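
For reference, one way to write the gradient of this objective is
\f[
    \nabla_{\boldsymbol c} \, l(\boldsymbol c)
    = \sum_{i=1}^n \sigma(-z_i \cdot \boldsymbol c^T \boldsymbol x_i)
      \cdot z_i \boldsymbol x_i.
\f]
This is only a sketch for orientation; the C implementation may use an
algebraically equivalent form. At the maximum, this gradient vanishes.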
| |
| There are many techniques for solving convex optimization problems. Currently, |
| logistic regression in MADlib can use one of two algorithms: |
| - Iteratively Reweighted Least Squares |
- A conjugate-gradient approach, also known as the Fletcher-Reeves method in the
| literature, where we use the Hestenes-Stiefel rule for calculating the step |
| size. |
| |
| |
| @prereq |
| |
| Implemented in C (the computation) and Python (the driver/outer loop) for |
| PostgreSQL/Greenplum. |
| |
| @usage |
| |
| -# The training data is expected to be of the following form:\n |
| <tt>{TABLE|VIEW} <em>sourceName</em> ([...] <em>dependentVariable</em> |
| BOOLEAN, <em>independentVariables</em> DOUBLE PRECISION[], [...])</tt> |
| -# Run the logistic regression by:\n |
| <tt>SELECT logregr_coef('<em>sourceName</em>', '<em>dependentVariable</em>', |
| '<em>independentVariables</em>', <em>numIterations</em>, |
| '<em>optimizer</em>', <em>precision</em>)</tt>\n |
| The last three arguments are optional and can be omitted, in which case |
| default values will be used. See logregr_coef().\n |
| Note: In order to model an intercept, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| |
| @examp |
| |
| @verbatim |
| # select * from artificiallogreg; |
| y | x |
| ---+------------------------------------------------------- |
| t | {-1.19845,1.15366,0.941779,-0.23669,-0.711024} |
| f | {-0.0680205,-0.716733,-0.149781,-0.410448,-0.0843123} |
| f | {-0.330021,0.222596,-0.976091,0.773816,-1.06238} |
| f | {0.648293,0.286225,0.524144,-0.141286,-1.41774} |
| f | {0.859484,-0.412929,-0.273368,-0.243059,0.714789} |
| [...] |
| |
| # select madlib.logregr_coef( |
| # 'artificiallogreg', 'y', 'x', 20, 'irls', 0.001 |
| # )::REAL[]; |
| logregr_coef |
| --------------------------------------------- |
| {-3.0307,3.63312,0.714105,-1.72496,1.37484} |
| @endverbatim |
| |
| @sa file regression.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa namespace logRegress (documenting the driver/outer loop implemented in |
| Python), function float8_cg_update_final() (documenting the |
| conjugate-gradient update/iteration steps, implemented in C), function |
    float8_irls_update_accum() (documenting the
| iteratively-reweighted-least-squares update/iteration steps, implemented in |
| C) |
| @endinternal |
| |
| @literature |
| |
| A somewhat random selection of nice write-ups, with valuable pointers into |
| further literature: |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf |
| |
| [2] Thomas P. Minka: A comparison of numerical optimizers for logistic |
| regression, 2003 (revised Mar 26, 2007), |
| http://research.microsoft.com/en-us/um/people/minka/papers/logreg/minka-logreg.pdf |
| |
| [3] Paul Komarek, Andrew W. Moore: Making Logistic Regression A Core Data Mining |
| Tool With TR-IRLS, IEEE International Conference on Data Mining 2005, |
| pp. 685-688, http://komarix.org/ac/papers/tr-irls.short.pdf |
| */ |
| |
| CREATE TYPE MADLIB_SCHEMA.logregr_cg_state AS ( |
| iteration INTEGER, |
| len INTEGER, |
| coef DOUBLE PRECISION[], |
| dir DOUBLE PRECISION[], |
| grad DOUBLE PRECISION[], |
| beta DOUBLE PRECISION, |
| |
| count BIGINT, |
| gradNew DOUBLE PRECISION[], |
| dTHd DOUBLE PRECISION, |
| logLikelihood DOUBLE PRECISION |
| ); |
| |
| CREATE TYPE MADLIB_SCHEMA.logregr_irls_state AS ( |
| coef DOUBLE PRECISION[], |
| logLikelihood DOUBLE PRECISION |
| ); |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_accum( |
| MADLIB_SCHEMA.logregr_cg_state, |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_cg_state) |
| RETURNS MADLIB_SCHEMA.logregr_cg_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_accum( |
| DOUBLE PRECISION[], |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_irls_state) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_final(MADLIB_SCHEMA.logregr_cg_state) |
| RETURNS MADLIB_SCHEMA.logregr_cg_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_final(DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.logregr_irls_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the conjugate-gradient method for computing |
| * logistic regression |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step( |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_cg_state) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_cg_update_accum, |
| STYPE=MADLIB_SCHEMA.logregr_cg_state, |
| FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final |
| ); |
| |
| /** |
| * @internal |
 * @brief Perform one iteration of the iteratively-reweighted-least-squares
 *        method for computing logistic regression
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step( |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_irls_state) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_irls_update_accum, |
| STYPE=float8[], |
    ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>)
| FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final, |
| INITCOND='{0}' |
| ); |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_should_terminate( |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION[], |
| VARCHAR, |
| DOUBLE PRECISION) |
| RETURNS BOOLEAN |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| -- begin functions for logistic-regression coefficients |
| -- We only need to document the last one (unfortunately, in Greenplum we have to |
| -- use function overloading instead of default arguments). |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER, |
| "optimizer" VARCHAR) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| /** |
| * @brief Compute logistic-regression coefficients |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @param source Name of the source relation containing the training data |
| * @param depColumn Name of the dependent column (of type BOOLEAN) |
| * @param indepColumn Name of the independent column (of type DOUBLE |
| * PRECISION[]) |
| * @param numIterations The maximum number of iterations |
 * @param optimizer The optimizer to use (either
 *     <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least
 *     squares or <tt>'cg'</tt> for conjugate gradient)
| * @param precision The difference between log-likelihood values in successive |
| * iterations that should indicate convergence, or 0 indicating that |
| * log-likelihood values should be ignored |
| * |
| * @note This function starts an iterative algorithm. It is not an aggregate |
| * function. Source and column names have to be passed as strings (due to |
| * limitations of the SQL syntax). |
| * |
| * @examp <tt>SELECT logregr_coef('data', 'y', 'array[1, x1, x2]', 20, 'cg', |
| * 0.001);</tt> |
| * |
| * @internal |
| * @sa This function is a wrapper for logRegress::compute_logregr_coef(), which |
| * sets the default values. |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER /*+ DEFAULT 20 */, |
| "optimizer" VARCHAR /*+ DEFAULT 'irls' */, |
| "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| ifdef(<nom4>PGXS</nom4>,<nom4> |
| /* |
| * Function for initializing python paths to the paths in dynamic_library_path. |
| * This is only needed when debugging Python-based functions without installing |
| * them in a location where Python would find them automatically. |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.init_python_paths() |
| RETURNS VOID AS |
| $$ |
| # FIXME: The following code should be common code and not reside in a specialized module |
| import sys |
| |
| dyld_paths = plpy.execute( |
| "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':') |
| before_default = True |
| count = 0 |
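# Paths listed before "$libdir" are inserted at the front of sys.path (in
# order), so they take precedence; paths listed after "$libdir" are appended.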
| for path in dyld_paths: |
| if path == "$libdir": |
| before_default = False |
| else: |
| if before_default: |
| sys.path.insert(count, path) |
| count += 1 |
| else: |
| sys.path.append(path) |
| $$ LANGUAGE plpythonu VOLATILE; |
| </nom4>) |