| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file linear.sql_in |
| * |
| * @brief SQL functions for linear regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to linear regression, see the module |
| * description \ref grp_linreg. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_linreg |
| |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief Also called Ordinary Least Squares Regression, models linear relationship between a dependent variable and one or more independent variables. |
| |
| Linear regression models a linear relationship of a scalar dependent variable |
| \f$ y \f$ to one or more explanatory independent variables \f$ x \f$ and builds |
| a model of coefficients. |
| |
| @anchor train |
| @par Training Function |
| |
| The linear regression training function has the following syntax. |
| <pre class="syntax"> |
| linregr_train( source_table, |
| out_table, |
| dependent_varname, |
| independent_varname, |
| grouping_cols, |
| heteroskedasticity_option |
| ) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. Name of the table containing the training data.</DD> |
| |
| <DT>out_table</DT> |
| <DD>TEXT. Name of the generated table containing the output model. |
| |
| The output table contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>\<...></th> |
| <td>Any grouping columns provided during training. |
| Present only if the grouping option is used.</td> |
| </tr> |
| <tr> |
| <th>coef</th> |
| <td>FLOAT8[]. Vector of the coefficients of the regression.</td> |
| </tr> |
| <tr> |
| <th>r2</th> |
| <td>FLOAT8. R-squared coefficient of determination of the model.</td> |
| </tr> |
| <tr> |
| <th>std_err</th> |
| <td>FLOAT8[]. Vector of the standard error of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>t_stats</th> |
| <td>FLOAT8[]. Vector of the t-statistics of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>p_values</th> |
| <td>FLOAT8[]. Vector of the p-values of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>condition_no</th> |
| <td>FLOAT8 array. The condition number of the \f$X^{*}X\f$ |
| matrix. A high condition number is usually an indication that there may be |
| some numeric instability in the result yielding a less reliable model. A high |
| condition number often results when there is a significant amount of |
| colinearity in the underlying design matrix, in which case other regression |
| techniques, such as elastic net regression, may be more appropriate.</td> |
| </tr> |
| <tr> |
| <th>bp_stats</th> |
| <td>FLOAT8. The Breush-Pagan statistic of heteroskedacity. Present |
| only if the heteroskedacity argument was set to True when the model was |
| trained.</td> |
| </tr> |
| <tr> |
| <th>bp_p_value</th> |
| <td>FLOAT8. The Breush-Pagan calculated p-value. Present only if |
| the heteroskedacity parameter was set to True when the model was |
| trained.</td> |
| </tr> |
| <tr> |
| <th>num_rows_processed</th> |
| <td>BIGINT. The number of rows that are actually used in each group.</td> |
| </tr> |
| <tr> |
| <th>num_missing_rows_skipped</th> |
| <td>INTEGER. The number of rows that have NULL values in the dependent and independent variables, and were skipped in the computation for each group.</td></tr> |
| <tr> |
| <th>variance_covariance</th> |
| <td>FLOAT[]. Variance/covariance matrix.</td></tr> |
| </table> |
| |
| A summary table named \<out_table\>_summary is created together with the output table. It has the following columns: |
| <table class="output"> |
| <tr> |
| <th>method</th> |
| <td>'linregr' for linear regression.</td> |
| </tr> |
| <tr> |
| <th>source_table</th> |
| <td>The data source table name</td></tr> |
| <tr><th>out_table</th> |
| <td>The output table name</td></tr> |
| <tr><th>dependent_varname</th> |
| <td>The dependent variable</td></tr> |
| <tr><th>independent_varname</th> |
| <td>The independent variables</td></tr> |
| <tr><th>num_rows_processed</th> |
| <td>The total number of rows that were used in the computation.</td></tr> |
| <tr><th>num_missing_rows_skipped</th> |
| <td>The total number of rows that were skipped because of NULL values in them.</td> |
| </tr> |
| <tr> |
| <th>grouping_cols</th> |
| <td>Names of the grouping columns.</td> |
| </tr> |
| </table> |
| </dd> |
| <dt></dt> |
| <dd>@note For p-values, we just return the computation result directly. |
| Other statistical packages like 'R' produce the same result, but on printing the |
| result to screen, another format function is used and any p-value that is |
| smaller than the machine epsilon (the smallest positive floating-point number |
| 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the |
| value of the machine epsilon). Although the result may look different, they are |
| in fact the same. |
| </dd> |
| |
| <DT>dependent_varname</DT> |
| <DD>TEXT. Expression to evaluate for the dependent variable.</DD> |
| |
| <DT>independent_varname</DT> |
| <DD>TEXT. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD> |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a |
| single result model is generated for the whole data set.</DD> |
| |
| <DT>heteroskedasticity_option (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. When TRUE, the heteroskedasticity of the model is also calculated and returned with the results.</DD> |
| </DL> |
| |
| @anchor warning |
| @warning The aggregate 'linregr' has been deprecated in favor of the function |
| 'linregr_train'. If the aggregate 'linregr' is used to output the results of |
| linear regression to a table, it is recommended to follow the general pattern |
| shown below (replace text within '<...>' with the appropriate variable names). |
| <pre class="syntax"> |
| CREATE TABLE \<output table\> AS |
| SELECT (r).* |
| FROM ( |
| SELECT linregr(\<dependent variable\>, \<independent variable\>) as r |
| FROM \<source table\> |
| ) q; |
| </pre> |
| |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function is as follows: |
| <pre class="syntax"> |
| linregr_predict(coef, col_ind) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>coef</dt> |
| <dd>FLOAT8[]. Vector of the coefficients of regression from training.</dd> |
| <dt>col_ind</dt> |
| <dd>FLOAT8[]. An array containing the independent variable column names, |
| as was used for the training. </dd> |
| |
| @anchor examples |
| @par Examples |
| -# Create an input data set. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses; |
| CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| INSERT INTO houses VALUES |
| (1 , 590 , 2 , 1 , 50000 , 770 , 22100), |
| (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000), |
| (3 , 20 , 3 , 1 , 22500 , 1060 , 3500), |
| (4 , 870 , 2 , 2 , 90000 , 1300 , 17500), |
| (5 , 1320 , 3 , 2 , 133000 , 1500 , 30000), |
| (6 , 1350 , 2 , 1 , 90500 , 820 , 25700), |
| (7 , 2790 , 3 , 2.5 , 260000 , 2130 , 25000), |
| (8 , 680 , 2 , 1 , 142500 , 1170 , 22000), |
| (9 , 1840 , 3 , 2 , 160000 , 1500 , 19000), |
| (10 , 3680 , 4 , 2 , 240000 , 2790 , 20000), |
| (11 , 1660 , 3 , 1 , 87000 , 1030 , 17500), |
| (12 , 1620 , 3 , 2 , 118600 , 1250 , 20000), |
| (13 , 3100 , 3 , 2 , 140000 , 1760 , 38000), |
| (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000), |
| (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000); |
| </pre> |
| -# Train a regression model. First, we generate a single regression for all data. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary; |
| SELECT madlib.linregr_train( 'houses', |
| 'houses_linregr', |
| 'price', |
| 'ARRAY[1, tax, bath, size]' |
| ); |
| </pre> |
| (Note that in this example we are dynamically creating the array of independent variables |
| from column names. If you have large numbers of independent variables beyond the PostgreSQL |
| limit of maximum columns per table, you would pre-build the arrays and store them in a |
| single column.) |
| -# Next we generate three output models, one for each value of "bedroom". |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary; |
| SELECT madlib.linregr_train( 'houses', |
| 'houses_linregr_bedroom', |
| 'price', |
| 'ARRAY[1, tax, bath, size]', |
| 'bedroom' |
| ); |
| </pre> |
| -# Examine the resulting models. |
| <pre class="example"> |
| -- Set extended display on for easier reading of output |
| \\x ON |
| SELECT * FROM houses_linregr; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]+--------------------------------------------------------------------------- |
| coef | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354} |
| r2 | 0.768577580597443 |
| std_err | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087} |
| t_stats | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605} |
| p_values | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186} |
| condition_no | 9002.50457085737 |
| num_rows_processed | 15 |
| num_missing_rows_skipped | 0 |
| variance_covariance | {{1119105512.78479,217782.067878023,-283344228.394562,-616679.69319088}, ... |
| </pre> |
| Alternatively you can unnest the results for easier reading of output. |
| <pre class="example"> |
| \\x OFF |
| SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute, |
| unnest(coef) as coefficient, |
| unnest(std_err) as standard_error, |
| unnest(t_stats) as t_stat, |
| unnest(p_values) as pvalue |
| FROM houses_linregr; |
| </pre> |
| Result: |
| <pre class="result"> |
| attribute | coefficient | standard_error | t_stat | pvalue |
| -----------+-------------------+------------------+-------------------+-------------------- |
| intercept | -12849.4168959872 | 33453.0344331391 | -0.38410317968819 | 0.708223134615422 |
| tax | 28.9613922651765 | 15.8992104963997 | 1.82156166004184 | 0.0958005827189772 |
| bath | 10181.6290712648 | 19437.7710925923 | 0.523806408809133 | 0.610804093526536 |
| size | 50.516894915354 | 32.928023174087 | 1.53416118083605 | 0.153235085548186 |
| (4 rows) |
| </pre> |
| -# View the results grouped by bedroom. |
| <pre class="example"> |
| \\x ON |
| SELECT * FROM houses_linregr_bedroom ORDER BY bedroom; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------------+---------------------------------------------------------------- |
| bedroom | 4 |
| coef | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276} |
| r2 | 1 |
| std_err | {0,0,0,0} |
| t_stats | {Infinity,Infinity,Infinity,Infinity} |
| p_values | |
| condition_no | Infinity |
| num_rows_processed | 1 |
| num_missing_rows_skipped | 0 |
| variance_covariance | {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}} |
| -[ RECORD 2 ]------------+---------------------------------------------------------------- |
| bedroom | 3 |
| coef | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324} |
| r2 | 0.841699901311252 |
| std_err | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954} |
| t_stats | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067} |
| p_values | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789} |
| condition_no | 11722.6225642147 |
| num_rows_processed | 9 |
| num_missing_rows_skipped | 0 |
| variance_covariance | {{3348705420.5583,433697.545104226,-70253017.45773,-2593488.13800193}, ... |
| -[ RECORD 3 ]------------+---------------------------------------------------------------- |
| bedroom | 2 |
| coef | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192} |
| r2 | 0.968809546465313 |
| std_err | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784} |
| t_stats | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697} |
| p_values | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122} |
| condition_no | 10086.1048721726 |
| num_rows_processed | 5 |
| num_missing_rows_skipped | 0 |
| variance_covariance | {{1226330302.62852,-300921.595596804,551696673.397849,-1544160.63236119}, ... |
| </pre> |
| |
| -# Compare predicted price with actual. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.) |
| <pre class="example"> |
| \\x OFF |
| SELECT houses.*, |
| madlib.linregr_predict( m.coef, |
| ARRAY[1,tax,bath,size] |
| ) as predict, |
| price - |
| madlib.linregr_predict( m.coef, |
| ARRAY[1,tax,bath,size] |
| ) as residual |
| FROM houses, houses_linregr m ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | predict | residual |
| ----+------+---------+------+--------+------+-------+------------------+------------------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 53317.4426965542 | -3317.44269655424 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 109152.124955627 | -24152.1249556268 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | 51459.3486308563 | -28959.3486308563 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | 98382.215907206 | -8382.21590720605 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | 121518.221409606 | 11481.7785903937 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | 77853.9455638561 | 12646.0544361439 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | 201007.926371721 | 58992.0736282788 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | 76130.7259665617 | 66369.2740334383 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | 136578.145387498 | 23421.8546125019 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | 255033.90159623 | -15033.9015962295 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | 97440.5250982852 | -10440.5250982852 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | 117577.415360321 | 1022.58463967926 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 186203.892319613 | -46203.8923196126 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 155946.739425521 | -7946.73942552117 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 94497.4293105379 | -29497.4293105379 |
| (15 rows) |
| </pre> |
| |
| -# Compare predicted price with actual with grouping. |
| It means a different model is used depending on the number of bedrooms. |
| <pre class="example"> |
| \\x OFF |
| SELECT houses.*, |
| madlib.linregr_predict( m.coef, |
| ARRAY[1,tax,bath,size] |
| ) as predict, |
| price - |
| madlib.linregr_predict( m.coef, |
| ARRAY[1,tax,bath,size] |
| ) as residual |
| FROM houses, houses_linregr_bedroom m |
| WHERE houses.bedroom = m.bedroom |
| ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | predict | residual |
| ----+------+---------+------+--------+------+-------+------------------+------------------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 43223.5393423978 | 6776.46065760222 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 111527.609949684 | -26527.609949684 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | 20187.9052986341 | 2312.09470136587 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | 99354.9203362612 | -9354.92033626116 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | 124508.080626412 | 8491.91937358756 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | 96640.8258367579 | -6140.8258367579 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | 224650.799707327 | 35349.2002926733 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | 138458.174652714 | 4041.82534728572 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | 138650.335313722 | 21349.6646862777 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | 240000 | 0 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | 62911.2752186594 | 24088.7247813406 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | 117007.693446414 | 1592.30655358579 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 189203.861766403 | -49203.8617664034 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 143322.539831869 | 4677.46016813093 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 82452.4386727394 | -17452.4386727394 |
| (15 rows) |
| </pre> |
| |
| |
| @anchor notes |
| @par Note |
| All table names can be optionally schema qualified (current_schemas() would be |
| searched if a schema name is not provided) and all table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'). |
| |
| |
| @anchor background |
| @par Technical Background |
| |
| Ordinary least-squares (OLS) linear regression refers to a stochastic model in |
| which the conditional mean of the dependent variable (usually denoted \f$ Y \f$) |
| is an affine function of the vector of independent variables (usually denoted |
| \f$ \boldsymbol x \f$). That is, |
| \f[ |
| E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is |
| that the residuals are i.i.d. distributed Gaussians. That is, the (conditional) |
| probability density of \f$ Y \f$ is given by |
| \f[ |
| f(y \mid \boldsymbol x) |
| = \frac{1}{\sqrt{2 \pi \sigma^2}} |
| \cdot \exp\left(-\frac{1}{2 \sigma^2} |
| \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right) |
| \,. |
| \f] |
| OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$ |
| that maximizes the likelihood of the observations. |
| |
| Let |
| - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent |
| variables, with \f$ n \f$ rows, containing the observed values of the |
| dependent variable, |
| - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ |
| columns and \f$ n \f$ rows, containing all observed vectors of independent |
| variables. |
| \f$ \boldsymbol x_i \f$ as rows, |
| - \f$ X^T \f$ denote the transpose of \f$ X \f$, |
| - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$. |
| |
| Maximizing the likelihood is equivalent to maximizing the log-likelihood |
| \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to |
| minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of |
| squared residuals or sum of squared errors of prediction), |
| \f[ |
| RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2 |
| = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c) |
| \,. |
| \f] |
| The first-order conditions yield that the \f$ RSS \f$ is minimized at |
| \f[ |
| \boldsymbol c = (X^T X)^+ X^T \boldsymbol y |
| \,. |
| \f] |
| |
| Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained |
| sum of squares</b> \f$ ESS \f$ (also called the regression sum of |
| squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is |
| done according to the following formulas: |
| \f{align*}{ |
| ESS & = \boldsymbol y^T X \boldsymbol c |
| - \frac{ \| y \|_1^2 }{n} \\ |
| TSS & = \sum_{i=1}^n y_i^2 |
| - \frac{ \| y \|_1^2 }{n} \\ |
| R^2 & = \frac{ESS}{TSS} |
| \f} |
| Note: The last equality follows from the definition |
| \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression |
| \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Sum_of_squares |
| |
| We estimate the variance |
| \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as |
| \f[ |
| \sigma^2 = \frac{RSS}{n - k} |
| \f] |
| and compute the t-statistic for coefficient \f$ i \f$ as |
| \f[ |
| t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }} |
| \,. |
| \f] |
| |
| The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a |
| value at least as extreme as the one observed, provided that the null hypothesis |
| (\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the |
| cumulative density function of student-t with \f$ \nu \f$ degrees of freedom, |
| the \f$ p \f$-value for coefficient \f$ i \f$ |
| is therefore |
| \f[ |
| p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| )) |
| \f] |
| where \f$ T \f$ is a student-t distributed random variable with mean 0. |
| |
| The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed |
| as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$ |
| is the largest singular value of \f$X\f$ i.e. the square root of the largest |
| eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$: |
| |
| \f[ |
| \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ , |
| \f] |
| where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$. |
| The condition number of a linear regression problem |
| is a worst-case measure of how sensitive the |
| result is to small perturbations of the input. A large condition number (say, |
| more than 1000) indicates the presence of significant multicollinearity. |
| |
| @anchor literature |
| @literature |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf |
| |
| [2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number. |
| |
| [3] Wikipedia: Spectral Norm, |
| http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm |
| |
| [4] Wikipedia: Breusch–Pagan test, |
| http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test |
| |
| [5] Wikipedia: Heteroscedasticity-consistent standard errors, |
| http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors |
| |
| |
| @anchor related |
| @par Related Topics |
| |
| @ref grp_robust |
| |
| @ref grp_clustered_errors |
| |
| @ref grp_validation |
| |
| File linear.sql_in, source file for the SQL functions |
| |
| |
| @internal |
| @sa Namespace \ref madlib::modules::regress |
| documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| --------------------------------------------------------------------------- |
| -- User facing functions |
| --------------------------------------------------------------------------- |
| /** |
| * @brief Linear regression training function with grouping support. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR, -- name of input table |
| out_table VARCHAR, -- name of output table |
| dependent_varname VARCHAR, -- name of dependent variable |
| independent_varname VARCHAR, -- name of independent variable |
| grouping_cols VARCHAR, -- names of columns to group-by |
| heteroskedasticity_option BOOLEAN -- do heteroskedasticity test or not |
| ) RETURNS VOID AS $$ |
| PythonFunction(regress, linear, linregr_train) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR, -- name of input table |
| out_table VARCHAR, -- name of output table |
| dependent_varname VARCHAR, -- name of dependent variable |
| independent_varname VARCHAR, -- name of independent variable |
| grouping_cols VARCHAR -- names of columns to group-by |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, $5, FALSE); |
| $$ LANGUAGE sql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR, -- name of input table |
| out_table VARCHAR, -- name of output table |
| dependent_varname VARCHAR, -- name of dependent variable |
| independent_varname VARCHAR -- name of independent variable |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, NULL, FALSE); |
| $$ LANGUAGE sql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| --------------------------------------------------------------------------- |
| |
| ----------------------------------------------------------------------- |
| -- Online help function |
| ----------------------------------------------------------------------- |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.linregr_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train( |
| message VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| PythonFunction(regress, linear, linregr_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ---------------------------------------------------------------------- |
| |
| |
| -- Deprecated functions -------------------------------------------------------- |
| |
| --------------------------------------------------------------------------- |
| -- Result Types |
| --------------------------------------------------------------------------- |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.linregr_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.linregr_result AS ( |
| coef DOUBLE PRECISION[], |
| r2 DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| condition_no DOUBLE PRECISION, |
| num_processed BIGINT, |
| vcov DOUBLE PRECISION[] |
| ); |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.heteroskedasticity_test_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS ( |
| bp_stats DOUBLE PRECISION, |
| bp_p_value DOUBLE PRECISION |
| ); |
| --------------------------------------------------------------------------- |
| |
| |
| --------------------------------------------------------------------------- |
| -- Functions for user-defined aggregates |
| --------------------------------------------------------------------------- |
| -- normal |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition( |
| state MADLIB_SCHEMA.bytea8, |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final( |
| state MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.linregr_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- hetero |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition( |
| state MADLIB_SCHEMA.bytea8, |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Final functions |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final( |
| state MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| --------------------------------------------------------------------------- |
| |
| |
| --------------------------------------------------------------------------- |
| -- User-defined aggregates |
| --------------------------------------------------------------------------- |
| /** |
| * @brief Compute linear regression coefficients and diagnostic statistics. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$ |
| * - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$ |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard errors, |
| * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$ |
| * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$ |
| * - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$ |
| * - <tt>condition_no FLOAT8</tt> - The condition number of matrix |
| * \f$ X^T X \f$. |
| * |
| * @usage |
| * - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic |
| * statistics:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>)).* |
| *FROM <em>sourceName</em>;</pre> |
| * - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>)).coef |
| *FROM <em>sourceName</em>;</pre> |
| * - Get a subset of the output columns, e.g., only the array of coefficients |
| * \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and |
| * the array of p-values \f$ \boldsymbol p \f$: |
| * <pre>SELECT (lr).coef, (lr).r2, (lr).p_values |
| *FROM ( |
| * SELECT linregr( <em>dependentVariable</em>, |
| * <em>independentVariables</em>) AS lr |
| * FROM <em>sourceName</em> |
| *) AS subq;</pre> |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linregr(DOUBLE PRECISION, DOUBLE PRECISION []); |
| CREATE AGGREGATE MADLIB_SCHEMA.linregr( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.linregr_transition, |
| STYPE=MADLIB_SCHEMA.bytea8, |
| FINALFUNC=MADLIB_SCHEMA.linregr_final, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linregr_merge_states,') |
| INITCOND='' |
| ); |
| |
| /** |
| * @brief Compute studentized Breuch-Pagan heteroskedasticity test for |
| * linear regression. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc |
| * - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc |
| * |
| * @usage |
| * <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>, coef)).* |
| * FROM ( |
| * SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef |
| * ) AS ols_coef, <em>sourceName</em> as src; |
| * </pre> |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.heteroskedasticity_test_linregr( |
| DOUBLE PRECISION, DOUBLE PRECISION [], DOUBLE PRECISION []); |
| CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[], |
| /*+ "olsCoefficients" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.hetero_linregr_transition, |
| STYPE=MADLIB_SCHEMA.bytea8, |
| FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,') |
| INITCOND='' |
| ); |
| --------------------------------------------------------------------------- |
| |
| /** |
| * @brief Predict the boolean value of a dependent variable for a specific |
| * independent variable value in a linear regression model |
| * |
| * @param coef Coefficients obtained by running linear regression. |
| * @param col_ind Independent variable array |
| * @returns DOUBLE PRECISION Predicted value |
| * |
| * This function computes the dot product of the independent variables and the |
| * coefficients. This requires the length of the two vectors to be the same. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict( |
| coef DOUBLE PRECISION[], |
| col_ind_var DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME', 'array_dot' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| -- Help messages ------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, linear, linregr_predict_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict() |
| RETURNS TEXT |
| AS $$ |
| SELECT MADLIB_SCHEMA.linregr_predict(''::TEXT); |
| $$ |
| LANGUAGE sql IMMUTABLE; |
| ------------------------------------------------------------- |