| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file linear.sql_in |
| * |
| * @brief SQL functions for linear regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to linear regression, see the module |
| * description \ref grp_linreg. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_linreg |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#about">About</a></li> |
| <li class="level1"><a href="#train">Usage</a></li> |
| <li class="level2"><a href="#train">Training Function</a></li> |
| <li class="level2"><a href="#output">Output Table</a></li> |
| <li class="level2"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#seealso">See Also</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| </ul></div> |
| |
| @anchor about |
| @about |
| Ordinary Least Squares Regression, also called Linear Regression, is a |
| statistical model used to fit linear models. |
| |
| It models a linear relationship of a scalar dependent variable \f$ y \f$ to one |
| or more explanatory independent variables \f$ x \f$ to build a |
| model of coefficients. |
| |
| |
| @anchor train |
| @par Training Function |
| @verbatim |
| linregr_train(source_table, out_table, |
| dependent_varname, |
| independent_varname, |
| input_group_cols := NULL, |
| heteroskedasticity_option := NULL) |
| @endverbatim |
| |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>Text value. The name of the table containing the training data.</DD> |
| |
| <DT>out_table</DT> |
| <DD>Text value. Name of the generated table containing the output model.</DD> |
| |
| <DT>dependent_varname</DT> |
| <DD>Text value. Expression to evaluate for the dependent variable.</DD> |
| |
| <DT>independent_varname</DT> |
| <DD>Text value. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD> |
| |
| <DT>input_group_cols</DT> |
| <DD>Text value. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated. Default value: NULL.</DD> |
| |
| <DT>heteroskedasticity_option</DT> |
| <DD>Boolean value. When True, the heteroskedacity of the model is also calculated and returned with the results. Default value: False.</DD> |
| </DL> |
| |
| @anchor output |
| @par Output Table |
| The output table produced by the linear regression training function contains the following columns. |
| <DL class="arglist"> |
| <DT>\<...></DT> |
| <DD>Any grouping columns provided during training. |
| Present only if the grouping option is used.</DD> |
| |
| <DT>coef</DT> |
| <DD>Float array. Vector of the coefficients of the regression.</DD> |
| |
| <DT>r2</DT> |
| <DD>Float. R-squared coefficient of determination of the model.</DD> |
| |
| <DT>std_err</DT> |
| <DD>Float array. Vector of the standard error of the coefficients.</DD> |
| |
| <DT>t_stats</DT> |
| <DD>Float array. Vector of the t-statistics of the coefficients.</DD> |
| |
| <DT>p_values</DT> |
| <DD>Float array. Vector of the p-values of the coefficients.</DD> |
| |
| <DT>condition_no</DT> |
| <DD>Float array. The condition number of the \f$X^{*}X\f$ |
| matrix. A high condition number is usually an indication that there may be |
| some numeric instability in the result yielding a less reliable model. A high |
| condition number often results when there is a significant amount of |
| colinearity in the underlying design matrix, in which case other regression |
| techniques, such as elastic net regression, may be more appropriate.</DD> |
| |
| <DT>bp_stats</DT> |
| <DD>Float. The Breush-Pagan statistic of heteroskedacity. Present |
| only if the heteroskedacity argument was set to True when the model was |
| trained.</DD> |
| |
| <DT>bp_p_value</DT> |
| <DD>Float. The Breush-Pagan calculated p-value. Present only if |
| the heteroskedacity parameter was set to True when the model was |
| trained.</DD> |
| </DL> |
| |
| @anchor predict |
| @par Prediction Function |
| @verbatim |
| linregr_predict( |
| coef, |
| col_ind |
| ) |
| @endverbatim |
| <dl class="arglist"> |
| <dt>coef</dt> |
| <dd>Float array. Vector of the coefficients of regression.</dd> |
| |
| <dt>col_ind</dt> |
| <dd>Float array. An array containing the independent variable column names. </dd> |
| |
| @anchor examples |
| @par Examples |
| -# Create an input data set. |
| @verbatim |
| CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| COPY houses FROM STDIN WITH DELIMITER '|'; |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| \. |
| @endverbatim |
| -# Train a regression model. |
| @verbatim |
| -- A single regression for all the data |
| SELECT madlib.linregr_train( |
| 'houses', 'houses_linregr', 'price', 'array[1, tax, bath, size]'); |
| |
| -- 3 output models, one for each value of "bedroom" |
| SELECT madlib.linregr_train( |
| 'houses', 'houses_linregr_bedroom', 'price', 'array[1, tax, bath, size]', |
| 'bedroom'); |
| @endverbatim |
| -# Examine the resulting models. |
| @verbatim |
| -- Set extended display on for easier reading of output |
| \x on |
| SELECT * from houses_linregr; |
| SELECT * FROM houses_linregr_bedroom; |
| |
| -- Alternatively you can unnest the results for easier reading of output |
| \x off |
| SELECT unnest(array['intercept','tax','bath','size']) as attribute, |
| unnest(coef) as coefficient, |
| unnest(std_err) as standard_error, |
| unnest(t_stats) as t_stat, |
| unnest(p_values) as pvalue |
| FROM houses_linregr; |
| @endverbatim |
| -# Use the prediction function to evaluate residuals. |
| @verbatim |
| SELECT houses.*, |
| madlib.linregr_predict(array[1,tax,bath,size], m.coef) as predict, |
| price - madlib.linregr_predict(array[1,tax,bath,size], m.coef) |
| as residual |
| FROM houses, houses_linregr m; |
| @endverbatim |
| |
| @anchor seealso |
| @sa File linear.sql_in, documenting the SQL training functions |
| @sa linregr() |
| @sa logregr_train() |
| @sa elastic_net_train() |
| @sa grp_robust |
| @sa grp_clustered_errors |
| @sa grp_validation |
| |
| @anchor background |
| @par Technical Background |
| |
| Ordinary least-squares (OLS) linear regression refers to a stochastic model in |
| which the conditional mean of the dependent variable (usually denoted \f$ Y \f$) |
| is an affine function of the vector of independent variables (usually denoted |
| \f$ \boldsymbol x \f$). That is, |
| \f[ |
| E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is |
| that the residuals are i.i.d. distributed Gaussians. That is, the (conditional) |
| probability density of \f$ Y \f$ is given by |
| \f[ |
| f(y \mid \boldsymbol x) |
| = \frac{1}{\sqrt{2 \pi \sigma^2}} |
| \cdot \exp\left(-\frac{1}{2 \sigma^2} |
| \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right) |
| \,. |
| \f] |
| OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$ |
| that maximizes the likelihood of the observations. |
| |
| Let |
| - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent |
| variables, with \f$ n \f$ rows, containing the observed values of the |
| dependent variable, |
| - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$ |
| columns and \f$ n \f$ rows, containing all observed vectors of independent |
| variables. |
| \f$ \boldsymbol x_i \f$ as rows, |
| - \f$ X^T \f$ denote the transpose of \f$ X \f$, |
| - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$. |
| |
| Maximizing the likelihood is equivalent to maximizing the log-likelihood |
| \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to |
| minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of |
| squared residuals or sum of squared errors of prediction), |
| \f[ |
| RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2 |
| = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c) |
| \,. |
| \f] |
| The first-order conditions yield that the \f$ RSS \f$ is minimized at |
| \f[ |
| \boldsymbol c = (X^T X)^+ X^T \boldsymbol y |
| \,. |
| \f] |
| |
| Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained |
| sum of squares</b> \f$ ESS \f$ (also called the regression sum of |
| squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is |
| done according to the following formulas: |
| \f{align*}{ |
| ESS & = \boldsymbol y^T X \boldsymbol c |
| - \frac{ \| y \|_1^2 }{n} \\ |
| TSS & = \sum_{i=1}^n y_i^2 |
| - \frac{ \| y \|_1^2 }{n} \\ |
| R^2 & = \frac{ESS}{TSS} |
| \f} |
| Note: The last equality follows from the definition |
| \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression |
| \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Sum_of_squares |
| |
| We estimate the variance |
| \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as |
| \f[ |
| \sigma^2 = \frac{RSS}{n - k} |
| \f] |
| and compute the t-statistic for coefficient \f$ i \f$ as |
| \f[ |
| t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }} |
| \,. |
| \f] |
| |
| The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a |
| value at least as extreme as the one observed, provided that the null hypothesis |
| (\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the |
| cumulative density function of student-t with \f$ \nu \f$ degrees of freedom, |
| the \f$ p \f$-value for coefficient \f$ i \f$ |
| is therefore |
| \f[ |
| p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| )) |
| \f] |
| where \f$ T \f$ is a student-t distributed random variable with mean 0. |
| |
| The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed |
| as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$ |
| is the largest singular value of \f$X\f$ i.e. the square root of the largest |
| eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$: |
| |
| \f[ |
| \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ , |
| \f] |
| where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$. |
| The condition number of a linear regression problem |
| is a worst-case measure of how sensitive the |
| result is to small perturbations of the input. A large condition number (say, |
| more than 1000) indicates the presence of significant multicollinearity. |
| |
| @anchor literature |
| @literature |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf |
| |
| [2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number. |
| |
| [3] Wikipedia: Spectral Norm, |
| http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm |
| |
| [4] Wikipedia: Breusch–Pagan test, |
| http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test |
| |
| [5] Wikipedia: Heteroscedasticity-consistent standard errors, |
| http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors |
| |
| |
| @internal |
| @sa Namespace \ref madlib::modules::regress |
| documenting the implementation in C++ |
| @endinternal |
| */ |
| --------------------------------------------------------------------------- |
| CREATE TYPE MADLIB_SCHEMA.linregr_result AS ( |
| coef DOUBLE PRECISION[], |
| r2 DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| condition_no DOUBLE PRECISION |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition( |
| state MADLIB_SCHEMA.bytea8, |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| -- Final functions |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final( |
| state MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.linregr_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS ( |
| bp_stats DOUBLE PRECISION, |
| bp_p_value DOUBLE PRECISION |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition( |
| state MADLIB_SCHEMA.bytea8, |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| -- Final functions |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final( |
| state MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| /** |
| * @brief Compute studentized Breuch-Pagan heteroskedasticity test for |
| * linear regression. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr) |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc |
| * - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc |
| * |
| * @usage |
| * <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>, coef)).* |
| * FROM ( |
| * SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef |
| * ) AS ols_coef, <em>sourceName</em> as src; |
| * </pre> |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[], |
| /*+ "olsCoefficients" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.hetero_linregr_transition, |
| STYPE=MADLIB_SCHEMA.bytea8, |
| FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final, |
| m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,') |
| INITCOND='' |
| ); |
| |
| /** |
| * @brief Compute linear regression coefficients and diagnostic statistics. |
| * |
| * @param dependentVariable Column containing the dependent variable |
| * @param independentVariables Column containing the array of independent variables |
| * |
| * @par |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return A composite value: |
| * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$ |
| * - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$ |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard errors, |
| * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$ |
| * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$ |
| * - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$ |
| * - <tt>condition_no FLOAT8</tt> - The condition number of matrix |
| * \f$ X^T X \f$. |
| * |
| * @usage |
| * - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic |
| * statistics:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>)).* |
| *FROM <em>sourceName</em>;</pre> |
| * - Get vector of coefficients \f$ \boldsymbol c \f$:\n |
| * <pre>SELECT (linregr(<em>dependentVariable</em>, |
| * <em>independentVariables</em>)).coef |
| *FROM <em>sourceName</em>;</pre> |
| * - Get a subset of the output columns, e.g., only the array of coefficients |
| * \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and |
| * the array of p-values \f$ \boldsymbol p \f$: |
| * <pre>SELECT (lr).coef, (lr).r2, (lr).p_values |
| *FROM ( |
| * SELECT linregr( <em>dependentVariable</em>, |
| * <em>independentVariables</em>) AS lr |
| * FROM <em>sourceName</em> |
| *) AS subq;</pre> |
| */ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.linregr( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.linregr_transition, |
| STYPE=MADLIB_SCHEMA.bytea8, |
| FINALFUNC=MADLIB_SCHEMA.linregr_final, |
| m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,') |
| INITCOND='' |
| ); |
| --------------------------- INTERNAL --------------------------------------- |
| |
| |
| /** |
| * @brief Return heteroskedasticity values for specific linear regression |
| * coefficients |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result( |
| source_table VARCHAR -- name of input table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression |
| ) |
| RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$ |
| DECLARE |
| hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result; |
| BEGIN |
| EXECUTE ' |
| SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr(' |
| || dependent_varname || ' , ' |
| || independent_varname || ' , ' |
| || 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])).* |
| FROM ' || source_table |
| INTO hsk_value; |
| |
| RETURN hsk_value; |
| END |
| $$ LANGUAGE plpgsql VOLATILE; |
| /** |
| * @brief Return linear regression output for source data |
| * |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result( |
| source_table VARCHAR -- name of input table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| ) |
| RETURNS MADLIB_SCHEMA.linregr_result AS $$ |
| DECLARE |
| lin_rst MADLIB_SCHEMA.linregr_result; |
| BEGIN |
| EXECUTE ' |
| SELECT (MADLIB_SCHEMA.linregr( ' |
| || dependent_varname || ' , ' |
| || independent_varname || ') |
| ).* |
| FROM ' || source_table |
| INTO lin_rst; |
| RETURN lin_rst; |
| END |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string( |
| lin_rst MADLIB_SCHEMA.linregr_result, |
| out_table TEXT |
| ) |
| RETURNS VARCHAR AS $$ |
| DECLARE |
| insert_string VARCHAR; |
| BEGIN |
| insert_string := 'INSERT INTO ' || out_table || ' VALUES ('; |
| insert_string := insert_string || |
| CASE |
| WHEN (lin_rst).coef is NULL |
| THEN '''{}'',' |
| ELSE 'ARRAY[' || array_to_string((lin_rst).coef, ',') || '], ' |
| END || |
| CASE |
| WHEN (lin_rst).r2 is NULL |
| THEN '0.0,' |
| ELSE (lin_rst).r2 || ',' |
| END || |
| CASE |
| WHEN (lin_rst).std_err is NULL |
| THEN '''{}'',' |
| ELSE 'ARRAY[' || array_to_string((lin_rst).std_err, ',') || '], ' |
| END || |
| CASE |
| WHEN (lin_rst).t_stats is NULL |
| THEN '''{}'',' |
| ELSE 'ARRAY[' || array_to_string((lin_rst).t_stats, ',') || '], ' |
| END || |
| CASE |
| WHEN (lin_rst).p_values is NULL |
| THEN '''{}'',' |
| ELSE 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], ' |
| END || |
| CASE |
| WHEN (lin_rst).condition_no is NULL |
| THEN '0.0' |
| ELSE (lin_rst).condition_no |
| END; |
| RETURN insert_string; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Compute linear regression coefficients and heterskedasticity values |
| * |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , heteroskedasticity_option BOOLEAN -- do you want heteroskedasticity output |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| insert_string VARCHAR; |
| lin_rst MADLIB_SCHEMA.linregr_result; |
| hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result; |
| BEGIN |
| IF (source_table IS NULL OR source_table = '') THEN |
| RAISE EXCEPTION 'Invalid input table name given.'; |
| END IF; |
| IF (out_table IS NULL OR out_table = '') THEN |
| RAISE EXCEPTION 'Invalid output table name given.'; |
| END IF; |
| IF (dependent_varname IS NULL OR dependent_varname = '') THEN |
| RAISE EXCEPTION 'Invalid dependent variable name given.'; |
| END IF; |
| IF (independent_varname IS NULL OR independent_varname = '') THEN |
| RAISE EXCEPTION 'Invalid independent variable name given.'; |
| END IF; |
| IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN |
| RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.'; |
| END IF; |
| -- create output table with appropriate column names |
| EXECUTE 'DROP TABLE IF EXISTS ' || out_table; |
| EXECUTE ' |
| CREATE TABLE ' || out_table || ' ( |
| coef DOUBLE PRECISION[], |
| r2 DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| condition_no DOUBLE PRECISION)'; |
| IF heteroskedasticity_option THEN |
| -- Alter output table to add heteroskedasticity values |
| EXECUTE ' |
| ALTER TABLE ' || out_table || ' |
| ADD COLUMN bp_stats DOUBLE PRECISION, |
| ADD COLUMN bp_p_value DOUBLE PRECISION'; |
| END IF; |
| -- compute linear regression and heteroskedasticity values (if required) |
| lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result( |
| source_table, dependent_varname, independent_varname); |
| insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string( |
| lin_rst, out_table); |
| -- Ensure Infinity and NaN are cast properly |
| insert_string := REGEXP_REPLACE(insert_string, 'Infinity', |
| '''Infinity''::double precision', 'gi'); |
| insert_string := REGEXP_REPLACE(insert_string, 'NaN', |
| '''NaN''::double precision', 'gi'); |
| IF heteroskedasticity_option THEN |
| -- add hsk values in the sql string and execute |
| hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result( |
| source_table, dependent_varname, |
| independent_varname, (lin_rst).coef); |
| EXECUTE |
| insert_string || ',' |
| || (hsk_value).bp_stats || ',' |
| || (hsk_value).bp_p_value || ')'; |
| ELSE |
| -- complete the sql string and execute |
| EXECUTE insert_string || ')'; |
| END IF; |
| END |
| $$ LANGUAGE plpgsql VOLATILE; |
| --------------------------------------------------------------------------- |
| /** |
| * @brief Compute linear regression coefficients and insert into an output |
| * table. The function drops the table if it already exists before creating |
| * the table. |
| * |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero( |
| source_table, out_table, dependent_varname, independent_varname, False); |
| -- RAISE NOTICE ' |
| -- Finished linear regression |
| -- * table : % (%, %) |
| -- Output: |
| -- * view : SELECT * FROM % ;', source_table, dependent_varname, |
| -- independent_varname, out_table; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| --------------------- GROUPING --------------------------------------------- |
| /** |
| * @brief Linear regression training function with grouping support and |
| * option for heteroskedasticity values. |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , input_group_cols VARCHAR -- names of columns to group-by |
| , heteroskedasticity_option BOOLEAN -- heteroskedasticity |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| input_table_name VARCHAR[]; |
| group_cols VARCHAR[]; |
| actual_table_name VARCHAR; |
| schema_name VARCHAR; |
| table_creation_string VARCHAR; |
| group_string VARCHAR; |
| group_array_length INTEGER; |
| col_data_type VARCHAR; |
| each_group INTEGER; |
| linregr_fitting_rst VARCHAR; |
| old_msg_level TEXT; |
| BEGIN |
| |
| EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level; |
| EXECUTE 'SET client_min_messages TO warning'; |
| |
| IF (source_table IS NULL OR source_table = '') THEN |
| RAISE EXCEPTION 'Invalid input table name given.'; |
| END IF; |
| IF (out_table IS NULL OR out_table = '') THEN |
| RAISE EXCEPTION 'Invalid output table name given.'; |
| END IF; |
| IF (dependent_varname IS NULL OR dependent_varname = '') THEN |
| RAISE EXCEPTION 'Invalid dependent variable name given.'; |
| END IF; |
| IF (independent_varname IS NULL OR independent_varname = '') THEN |
| RAISE EXCEPTION 'Invalid independent variable name given.'; |
| END IF; |
| |
| IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN |
| RAISE EXCEPTION 'Source table name does not exist.'; |
| END IF; |
| |
| IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN |
| RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.'; |
| END IF; |
| |
| |
| -- initial validation |
| IF (input_group_cols IS NULL) |
| --OR array_upper(input_group_cols, 1) IS NULL |
| --OR array_upper(input_group_cols, 1) = 0) |
| THEN |
| PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero( |
| source_table, out_table, |
| dependent_varname, independent_varname, |
| heteroskedasticity_option); |
| ELSE |
| group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols); |
| -- create output table |
| EXECUTE 'DROP TABLE IF EXISTS ' || out_table; |
| table_creation_string := 'CREATE TABLE ' || out_table || '('; |
| group_array_length = array_upper(group_cols, 1); |
| |
| input_table_name = regexp_split_to_array(source_table, E'\\.'); |
| IF array_upper(input_table_name, 1) = 1 THEN |
| actual_table_name = input_table_name[1]; |
| schema_name := current_schema(); |
| ELSIF array_upper(input_table_name, 1) = 2 THEN |
| actual_table_name = input_table_name[2]; |
| schema_name = input_table_name[1]; |
| ELSE |
| RAISE EXCEPTION 'Incorrect input source table name provided'; |
| END IF; |
| |
| -- Check that each grouping column exists |
| FOR each_group in 1 .. group_array_length |
| LOOP |
| if NOT MADLIB_SCHEMA.check_if_col_exists(source_table, |
| group_cols[each_group]) THEN |
| RAISE EXCEPTION 'Grouping column % does not exist', |
| group_cols[each_group]; |
| END IF; |
| |
| |
| END LOOP; |
| |
| FOR each_group in 1 .. group_array_length |
| LOOP |
| -- create a string that makes list of |
| EXECUTE 'SELECT data_type FROM information_schema.columns |
| WHERE |
| table_schema = ''' || schema_name || ''' |
| AND table_name = ''' || actual_table_name || ''' |
| AND column_name= ''' || group_cols[each_group] || '''' |
| INTO col_data_type; |
| |
| table_creation_string := table_creation_string |
| || group_cols[each_group] |
| || ' ' || col_data_type || ','; |
| END LOOP; |
| |
| -- finish creating the output table |
| EXECUTE table_creation_string || ' |
| coef DOUBLE PRECISION[], |
| r2 DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| condition_no DOUBLE PRECISION)'; |
| IF heteroskedasticity_option THEN |
| EXECUTE 'ALTER TABLE ' || out_table || ' |
| ADD COLUMN bp_stats DOUBLE PRECISION, |
| ADD COLUMN bp_p_value DOUBLE PRECISION'; |
| END IF; |
| |
| group_string := ''; |
| FOR each_group in 1 .. (group_array_length-1) |
| LOOP |
| group_string := group_string || group_cols[each_group] || ','; |
| END LOOP; |
| group_string := group_string || group_cols[group_array_length]; |
| |
| IF heteroskedasticity_option THEN |
| linregr_fitting_rst := MADLIB_SCHEMA.__unique_string(); |
| EXECUTE ' |
| DROP TABLE IF EXISTS '|| linregr_fitting_rst ||'; |
| CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS |
| SELECT |
| '|| group_string ||', |
| (MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).* |
| FROM '|| source_table ||' |
| GROUP BY '|| group_string; |
| |
| EXECUTE ' |
| INSERT INTO ' || out_table || ' |
| SELECT * |
| FROM |
| '|| linregr_fitting_rst ||' |
| JOIN ( |
| SELECT |
| '|| group_string ||', |
| (MADLIB_SCHEMA.heteroskedasticity_test_linregr(' |
| || dependent_varname || ',' |
| || independent_varname || ', t.coef)).* |
| FROM |
| '|| source_table ||' AS s |
| JOIN |
| '|| linregr_fitting_rst ||' AS t |
| USING (' || group_string || ') |
| GROUP BY ' || group_string ||') z |
| USING ('|| group_string ||')'; |
| |
| EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst; |
| ELSE |
| EXECUTE ' |
| INSERT INTO ' || out_table || ' |
| SELECT ' || group_string || ', (result).coef, (result).r2, |
| (result).std_err, (result).t_stats, |
| (result).p_values, (result).condition_no |
| FROM ( |
| SELECT ' || group_string || |
| ', MADLIB_SCHEMA.linregr( ' |
| || dependent_varname || ' , ' |
| || independent_varname || ' ) |
| AS result |
| FROM ' || source_table || ' |
| GROUP BY ' || group_string || |
| ') subq'; |
| END IF; |
| END IF; |
| |
| EXECUTE 'SET client_min_messages TO '|| old_msg_level; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| --------------------------------------------------------------------------- |
| /** |
| * @brief Linear regression training function with grouping support. |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.linregr_train( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , group_cols VARCHAR -- names of columns to group-by |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.linregr_train( source_table, out_table, |
| dependent_varname, |
| independent_varname, group_cols, FALSE); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| --------------------------------------------------------------------------- |
| |