/* ----------------------------------------------------------------------- *//**
*
* @file linear.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa For a brief introduction to linear regression, see the module
* description \ref grp_linreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_linreg
@about
Ordinary least-squares (OLS) linear regression refers to a stochastic model in
which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
is an affine function of the vector of independent variables (usually denoted
\f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption
is that the residuals are independent and identically distributed Gaussians.
That is, the (conditional) probability density of \f$ Y \f$ is given by
\f[
f(y \mid \boldsymbol x)
= \frac{1}{\sqrt{2 \pi \sigma^2}}
\cdot \exp\left(-\frac{1}{2 \sigma^2}
\cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
\,.
\f]
OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
that maximizes the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of the \f$ n \f$ observed
values of the dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ columns and \f$ n \f$ rows,
containing all observed vectors of independent variables
\f$ \boldsymbol x_i \f$ as rows,
- \f$ X^T \f$ denote the transpose of \f$ X \f$,
- \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
Maximizing the likelihood is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
squared residuals or sum of squared errors of prediction),
\f[
RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
= (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
\,.
\f]
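To make this step explicit: substituting the density above, the log-likelihood
of the sample is
\f[
    \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i)
    = -\frac{n}{2} \log(2 \pi \sigma^2) - \frac{RSS}{2 \sigma^2}
    \,,
\f]
so for any fixed \f$ \sigma^2 \f$, maximizing the log-likelihood over
\f$ \boldsymbol c \f$ is the same as minimizing \f$ RSS \f$.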
The first-order conditions yield that the \f$ RSS \f$ is minimized at
\f[
\boldsymbol c = (X^T X)^+ X^T \boldsymbol y
\,.
\f]
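In detail, setting the gradient of \f$ RSS \f$ with respect to
\f$ \boldsymbol c \f$ to zero gives the normal equations
\f[
    \nabla_{\boldsymbol c} RSS
    = -2 X^T (\boldsymbol y - X \boldsymbol c) = \boldsymbol 0
    \quad\Longleftrightarrow\quad
    X^T X \boldsymbol c = X^T \boldsymbol y
    \,,
\f]
and \f$ \boldsymbol c = (X^T X)^+ X^T \boldsymbol y \f$ is the minimum-norm
solution of these equations (the unique solution whenever \f$ X^T X \f$ is
invertible).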
Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
sum of squares</b> \f$ ESS \f$ (also called the regression sum of
squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
done according to the following formulas:
\f{align*}{
    ESS & = \boldsymbol y^T X \boldsymbol c
        - \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
    TSS & = \sum_{i=1}^n y_i^2
        - \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
    R^2 & = \frac{ESS}{TSS}
\f}
Note: The last equality follows from the definition
\f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
\f$ TSS = RSS + ESS \f$. (The latter holds because, by the normal equations,
the residual vector is orthogonal to the fitted values, and the residuals sum
to zero when the model contains an intercept.) A proof can be found, e.g., at:
http://en.wikipedia.org/wiki/Sum_of_squares
We estimate the variance
\f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
\f[
\sigma^2 = \frac{RSS}{n - k}
\f]
and compute the t-statistic for coefficient \f$ i \f$ as
\f[
t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
\,.
\f]
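For instance, in the example below, the <tt>size</tt> coefficient is
\f$ c_4 \approx 130.79 \f$ with standard error \f$ \approx 36.21 \f$, so
\f[
    t_4 = \frac{130.79}{36.21} \approx 3.612
    \,,
\f]
which is the last entry of the <tt>t_stats</tt> array in the example output.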
The \f$ p \f$-value for coefficient \f$ i \f$ is the probability, under the
null hypothesis \f$ c_i = 0 \f$, of observing a t-statistic at least as
extreme as the one computed from the data; a small \f$ p_i \f$ is evidence
that \f$ c_i \f$ differs significantly from 0. Letting \f$ F_\nu \f$ denote
the cumulative distribution function of the Student-t distribution with
\f$ \nu \f$ degrees of freedom, the \f$ p \f$-value for coefficient \f$ i \f$
is therefore
\f[
p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
\f]
where \f$ T \f$ is a Student-t distributed random variable with \f$ n - k \f$
degrees of freedom.
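Continuing the same example: with \f$ n = 15 \f$ rows and \f$ k = 4 \f$
coefficients there are \f$ n - k = 11 \f$ degrees of freedom, so
\f[
    p_4 = 2 \cdot (1 - F_{11}(3.612)) \approx 0.004
    \,,
\f]
which matches the last entry of the <tt>p_values</tt> array in the example
output.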
@input
The training data is expected to be of the following form:
<pre>{TABLE|VIEW} <em>sourceName</em> (
...
<em>dependentVariable</em> FLOAT8,
<em>independentVariables</em> FLOAT8[],
...
)</pre>
@usage
- Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic statistics:
<pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).*
FROM <em>sourceName</em>;</pre>
Output:
<pre>coef | r2 | std_err | t_stats | p_values
-----+----+---------+---------+---------
...
</pre>
- Get vector of coefficients \f$ \boldsymbol c \f$:\n
<pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef
FROM <em>sourceName</em>;</pre>
- Get array of coefficients \f$ \boldsymbol c \f$, coefficient of
determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$:
<pre>SELECT (lr).coef, (lr).r2, (lr).p_values
FROM (
SELECT \ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr
FROM <em>sourceName</em>
) AS subq;</pre>
@examp
The following example is taken from http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf.
-# Create the sample data set:
\verbatim
sql> CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT);
sql> COPY houses FROM STDIN WITH DELIMITER '|';
1 | 590 | 2 | 1 | 50000 | 770 | 22100
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
3 | 20 | 3 | 1 | 22500 | 1060 | 3500
4 | 870 | 2 | 2 | 90000 | 1300 | 17500
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
6 | 1350 | 2 | 1 | 90500 | 820 | 25700
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
8 | 680 | 2 | 1 | 142500 | 1170 | 22000
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
\.
\endverbatim
-# You can call the linregr() function to retrieve an individual statistic:
\verbatim
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses;
coef
------------------------------------------------------------------------
{27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses;
r2
------------------
0.74537400999234
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses;
std_err
----------------------------------------------------------------------
{56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses;
t_stats
-----------------------------------------------------------------------
{0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869}
(1 row)
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses;
p_values
-----------------------------------------------------------------------------
{0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573}
(1 row)
\endverbatim
-# Alternatively, you can call the linregr() function to retrieve the full record:
\verbatim
sql> SELECT (linregr(price, array[1, bedroom, bath, size])).* FROM houses;
coef | r2 | std_err | t_stats | p_values
------------------------------------------------------------------------+------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------------
{27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862} | 0.74537400999234 | {56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744} | {0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869} | {0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573}
(1 row)
\endverbatim
@literature
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
@sa File linear.sql_in documenting the SQL functions.
@internal
@sa Class \ref madlib::modules::regress::LinearRegression "LinearRegression"
documenting the implementation in C++
@endinternal
*/
CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
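-- Transition function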
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
state DOUBLE PRECISION[],
y DOUBLE PRECISION,
x DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
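-- Merge function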
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
-- Final function
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.linregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute linear regression coefficients and diagnostic statistics.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
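* For example (reusing the <tt>houses</tt> table from the examples above), the
* constant 1 is supplied as the first element of the independent-variable
* array:
* <pre>SELECT (linregr(price, array[1, bedroom, bath, size])).* FROM houses;</pre>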
*
* @return A composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).*
*FROM <em>sourceName</em>;</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef
*FROM <em>sourceName</em>;</pre>
* - Get array of coefficients \f$ \boldsymbol c \f$, coefficient of
* determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
*FROM (
* SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr
* FROM <em>sourceName</em>
*) AS subq;</pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.linregr_transition,
STYPE=float8[],
FINALFUNC=MADLIB_SCHEMA.linregr_final,
m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
INITCOND='{0,0,0,0,0}'
);