/* ----------------------------------------------------------------------- *//**
*
* @file linear.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa For a brief introduction to linear regression, see the module
* description \ref grp_linreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_linreg
@about
Ordinary least-squares (OLS) linear regression refers to a stochastic model in
which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
is an affine function of the vector of independent variables (usually denoted
\f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption
is that the residuals are independent and identically distributed Gaussians.
That is, the (conditional) probability density of \f$ Y \f$ is given by
\f[
f(y \mid \boldsymbol x)
= \frac{1}{\sqrt{2 \pi \sigma^2}}
\cdot \exp\left(-\frac{1}{2 \sigma^2}
\cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
\,.
\f]
OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
that maximizes the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of the \f$ n \f$ observed
values of the dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ columns and \f$ n \f$ rows,
containing all observed vectors of independent variables
\f$ \boldsymbol x_i \f$ as rows,
- \f$ X^T \f$ denote the transpose of \f$ X \f$,
- \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
Maximizing the likelihood is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
squared residuals or sum of squared errors of prediction),
\f[
RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
= (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
\,.
\f]
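To make this step explicit: substituting the density above, the log-likelihood
of the sample is
\f[
    \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i)
    = -\frac{n}{2} \log(2 \pi \sigma^2) - \frac{RSS}{2 \sigma^2}
    \,,
\f]
so for any fixed \f$ \sigma^2 \f$, maximizing the log-likelihood over
\f$ \boldsymbol c \f$ is the same as minimizing \f$ RSS \f$.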
The first-order conditions yield that the \f$ RSS \f$ is minimized at
\f[
\boldsymbol c = (X^T X)^+ X^T \boldsymbol y
\,.
\f]
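In detail, setting the gradient of \f$ RSS \f$ with respect to
\f$ \boldsymbol c \f$ to zero gives the normal equations
\f[
    \nabla_{\boldsymbol c} RSS
    = -2 X^T (\boldsymbol y - X \boldsymbol c) = \boldsymbol 0
    \quad\Longleftrightarrow\quad
    X^T X \boldsymbol c = X^T \boldsymbol y
    \,,
\f]
and \f$ \boldsymbol c = (X^T X)^+ X^T \boldsymbol y \f$ is the minimum-norm
solution of these equations (the unique solution whenever \f$ X^T X \f$ is
invertible).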
Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
sum of squares</b> \f$ ESS \f$ (also called the regression sum of
squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
done according to the following formulas:
\f{align*}{
    ESS & = \boldsymbol y^T X \boldsymbol c
        - \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
    TSS & = \sum_{i=1}^n y_i^2
        - \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
    R^2 & = \frac{ESS}{TSS}
\f}
Note: The last equality follows from the definition
\f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
\f$ TSS = RSS + ESS \f$. (The latter holds because, by the normal equations,
the residual vector is orthogonal to the fitted values, and the residuals sum
to zero when the model contains an intercept.) A proof can be found, e.g., at:
http://en.wikipedia.org/wiki/Sum_of_squares
We estimate the variance
\f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
\f[
\sigma^2 = \frac{RSS}{n - k}
\f]
and compute the t-statistic for coefficient \f$ i \f$ as
\f[
t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
\,.
\f]
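For instance, in the example below, the <tt>size</tt> coefficient is
\f$ c_4 \approx 130.79 \f$ with standard error \f$ \approx 36.21 \f$, so
\f[
    t_4 = \frac{130.79}{36.21} \approx 3.612
    \,,
\f]
which is the last entry of the <tt>t_stats</tt> array in the example output.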
The \f$ p \f$-value for coefficient \f$ i \f$ is the probability, under the
null hypothesis \f$ c_i = 0 \f$, of observing a t-statistic at least as
extreme as the one computed from the data; a small \f$ p_i \f$ is evidence
that \f$ c_i \f$ differs significantly from 0. Letting \f$ F_\nu \f$ denote
the cumulative distribution function of the Student-t distribution with
\f$ \nu \f$ degrees of freedom, the \f$ p \f$-value for coefficient \f$ i \f$
is therefore
\f[
p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
\f]
where \f$ T \f$ is a Student-t distributed random variable with \f$ n - k \f$
degrees of freedom.
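Continuing the same example: with \f$ n = 15 \f$ rows and \f$ k = 4 \f$
coefficients there are \f$ n - k = 11 \f$ degrees of freedom, so
\f[
    p_4 = 2 \cdot (1 - F_{11}(3.612)) \approx 0.004
    \,,
\f]
which matches the last entry of the <tt>p_values</tt> array in the example
output.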
@input
The training data is expected to be of the following form:
<pre>{TABLE|VIEW} <em>sourceName</em> (
...
<em>dependentVariable</em> FLOAT8,
<em>independentVariables</em> FLOAT8[],
...
)</pre>
@usage
- Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic statistics:
<pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).*
FROM <em>sourceName</em>;</pre>
Output:
<pre>coef | r2 | std_err | t_stats | p_values
-----+----+---------+---------+---------
...
</pre>
- Get vector of coefficients \f$ \boldsymbol c \f$:\n
<pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef
FROM <em>sourceName</em>;</pre>
- Get array of coefficients \f$ \boldsymbol c \f$, coefficient of
determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$:
<pre>SELECT (lr).coef, (lr).r2, (lr).p_values
FROM (
SELECT \ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr
FROM <em>sourceName</em>
) AS subq;</pre>
@examp
The following example is taken from http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf.
-# Create the sample data set:
\verbatim
sql> CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT);
sql> COPY houses FROM STDIN WITH DELIMITER '|';
1 | 590 | 2 | 1 | 50000 | 770 | 22100
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
3 | 20 | 3 | 1 | 22500 | 1060 | 3500
4 | 870 | 2 | 2 | 90000 | 1300 | 17500
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
6 | 1350 | 2 | 1 | 90500 | 820 | 25700
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
8 | 680 | 2 | 1 | 142500 | 1170 | 22000
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
\.
\endverbatim
-# You can call the linregr() function to retrieve an individual statistic:
\verbatim
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses;
coef
------------------------------------------------------------------------
{27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses;
r2
------------------
0.74537400999234
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses;
std_err
----------------------------------------------------------------------
{56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses;
t_stats
-----------------------------------------------------------------------
{0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses;
p_values
-----------------------------------------------------------------------------
{0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573}
(1 row)
\endverbatim
-# Alternatively, you can call the linregr() function to retrieve the full record:
\verbatim
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).* FROM houses;
coef | r2 | std_err | t_stats | p_values
------------------------------------------------------------------------+------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------------
{27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862} | 0.74537400999234 | {56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744} | {0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869} | {0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573}
(1 row)
\endverbatim
@literature
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
@sa File linear.sql_in documenting the SQL functions.
@internal
@sa Class \ref madlib::modules::regress::LinearRegression "LinearRegression"
documenting the implementation in C++
@endinternal
*/
CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
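-- Transition function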
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
state DOUBLE PRECISION[],
y DOUBLE PRECISION,
x DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
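-- Merge function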
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
-- Final function
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.linregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute linear regression coefficients and diagnostic statistics.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
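* For example (reusing the <tt>houses</tt> table from the examples above), the
* constant 1 is supplied as the first element of the independent-variable
* array:
* <pre>SELECT (linregr(price, array[1, bedroom, bath, size])).* FROM houses;</pre>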
*
* @return A composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).*
*FROM <em>sourceName</em>;</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef
*FROM <em>sourceName</em>;</pre>
* - Get array of coefficients \f$ \boldsymbol c \f$, coefficient of
* determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
*FROM (
* SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr
* FROM <em>sourceName</em>
*) AS subq;</pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.linregr_transition,
STYPE=float8[],
FINALFUNC=MADLIB_SCHEMA.linregr_final,
m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
INITCOND='{0,0,0,0,0}'
);