| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file linear.sql_in |
| * |
| * @brief SQL functions for linear regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to linear regression, see the module |
| * description \ref grp_linreg. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_linreg |
| |
| @about |
| |
| Ordinary least-squares (OLS) linear regression refers to a stochastic model in |
| which the conditional mean of the dependent variable (usually denoted \f$ Y \f$) |
| is an affine function of the vector of independent variables (usually denoted |
| \f$ \boldsymbol x \f$). That is, |
| \f[ |
| E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is |
| that the residuals are i.i.d. distributed Gaussians. That is, the (conditional) |
| probability density of \f$ Y \f$ is given by |
| \f[ |
| f(y \mid \boldsymbol x) |
| = \frac{1}{\sqrt{2 \pi \sigma^2}} |
| \cdot \exp\left(-\frac{1}{2 \sigma^2} |
| \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right) |
| \,. |
| \f] |
| OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$ |
| that maximizes the likelihood of the observations. |
| |
| Let |
| - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent variables, with |
| \f$ n \f$ rows, containing the observed values of the dependent variable, |
| - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ columns and \f$ n \f$ rows, |
| containing all observed vectors of independent variables |
| \f$ \boldsymbol x_i \f$ as rows, |
| - \f$ X^T \f$ denote the transpose of \f$ X \f$, |
| - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$. |
| |
| Maximizing the likelihood is equivalent to maximizing the log-likelihood |
| \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to |
| maximizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of |
| squared residuals or sum of squared errors of prediction), |
| \f[ |
| RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2 |
| = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c) |
| \,. |
| \f] |
| The first-order conditions yield that the \f$ RSS \f$ is maximized at |
| \f[ |
| \boldsymbol c = (X^T X)^+ X^T \boldsymbol y |
| \,. |
| \f] |
| |
| Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained |
| sum of squares</b> \f$ ESS \f$ (also called the regression sum of |
| squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is |
| done according to the following formulas: |
| \f{align*}{ |
| ESS & = \boldsymbol y^T X \boldsymbol c |
| - \frac{ \| y \|_1^2 }{n} \\ |
| TSS & = \sum_{i=1}^n y_i^2 |
| - \frac{ \| y \|_1^2 }{n} \\ |
| R^2 & = \frac{ESS}{TSS} |
| \f} |
| Note: The last equality follows from the definition |
| \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression |
| \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Sum_of_squares |
| |
| We estimate the variance |
| \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as |
| \f[ |
| \sigma^2 = \frac{RSS}{n - k} |
| \f] |
| and compute the t-statistic for coefficient \f$ i \f$ as |
| \f[ |
| t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }} |
| \,. |
| \f] |
| |
| The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability that the |
| null hypothesis (\f$ c_i = 0 \f$) is false, i.e., the probability that |
| \f$ c_i \f$ differs significantly from 0. Letting \f$ F_\nu \f$ denote the |
| cumulative density function of student-t with \f$ \nu \f$ degrees of freedom, |
| the \f$ p \f$-value for coefficient \f$ i \f$ |
| is therefore |
| \f[ |
| p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| )) |
| \f] |
| where \f$ T \f$ is a student-t distributed random variable with mean 0. |
| |
| @input |
| |
| The training data is expected to be of the following form: |
| <pre>{TABLE|VIEW} <em>sourceName</em> ( |
| ... |
| <em>dependentVariable</em> FLOAT8, |
| <em>independentVariables</em> FLOAT8[], |
| ... |
| )</pre> |
| |
| @usage |
| |
| - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic statistics: |
| <pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).* |
| FROM <em>sourceName</em>;</pre> |
| Output: |
| <pre>coef | r2 | std_err | t_stats | p_values |
| -----+----+---------+---------+--------- |
| ... |
| </pre> |
| - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| <pre>SELECT (\ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef |
| FROM <em>sourceName</em>;</pre> |
| - Get array of coefficients \f$ \boldsymbol c \f$, coefficient of |
| determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$: |
| <pre>SELECT (lr).coef, (lr).r2, (lr).p_values |
| FROM ( |
| SELECT \ref linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr |
| FROM <em>sourceName</em> |
| ) AS subq;</pre> |
| |
| @examp |
| |
| The following example is taken from http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf. |
| |
| -# Create the sample data set: |
| \verbatim |
| sql> CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT); |
| sql> COPY houses FROM STDIN WITH DELIMITER '|'; |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| \. |
| \endverbatim |
| -# You can call the linregr() function for an individual metric: |
| \verbatim |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses; |
| coef |
| ------------------------------------------------------------------------ |
| {27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862} |
| (1 row) |
| |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses; |
| r2 |
| ------------------ |
| 0.74537400999234 |
| (1 row) |
| |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses; |
| std_err |
| ---------------------------------------------------------------------- |
| {56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744} |
| (1 row) |
| |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses; |
| t_stats |
| ----------------------------------------------------------------------- |
| {0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869} |
| (1 row) |
| |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses; |
| p_values |
| ----------------------------------------------------------------------------- |
| {0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573} |
| (1 row) |
| \endverbatim |
| -# Alternatively you can call the linreg() function for the full record: |
| \verbatim |
| sql> SELECT (linregr(price, array[1, bedroom, bath, size])).* FROM houses; |
| coef | r2 | std_err | t_stats | p_values |
| ------------------------------------------------------------------------+------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------+----------------------------------------------------------------------------- |
| {27923.4332080858,-35524.7752261679,2269.34397934866,130.793920087862} | 0.74537400999234 | {56306.482134735,25036.6536954409,22208.6687273202,36.2086422648744} | {0.49591862516412,-1.4189106762553,0.102182801104013,3.6122293437869} | {0.629711071585906,0.183633156540304,0.920450512607465,0.00408159080204573} |
| (1 row) |
| \endverbatim |
| |
| @literature |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf |
| |
| @sa File linear.sql_in documenting the SQL functions. |
| |
| @internal |
| @sa Class \ref madlib::modules::regress::LinearRegression "LinearRegression" |
| documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| CREATE TYPE MADLIB_SCHEMA.linregr_result AS ( |
| coef DOUBLE PRECISION[], |
| r2 DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition( |
| state DOUBLE PRECISION[], |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states( |
| state1 DOUBLE PRECISION[], |
| state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| -- Final functions |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final( |
| state DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.linregr_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| /** |
| * @brief Compute linear regression coefficients and diagnostic statistics. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$ |
| * - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$ |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard errors, |
| * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$ |
| * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$ |
| * - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$ |
| * |
| * @usage |
| * - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic |
| * statistics:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).* |
| *FROM <em>sourceName</em>;</pre> |
| * - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef |
| *FROM <em>sourceName</em>;</pre> |
| * - Get array of coefficients \f$ \boldsymbol c \f$, coefficient of |
| * determination \f$ R^2 \f$, and array of p-values \f$ \boldsymbol p \f$: |
| * <pre>SELECT (lr).coef, (lr).r2, (lr).p_values |
| *FROM ( |
| * SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>) AS lr |
| * FROM <em>sourceName</em> |
| *) AS subq;</pre> |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.linregr( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.linregr_transition, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.linregr_final, |
| m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.linregr_merge_states,') |
| INITCOND='{0,0,0,0,0}' |
| ); |