-- Source: src/ports/postgres/modules/regress/linear.sql_in (MADlib)

 /* ----------------------------------------------------------------------- *//**
  *
  * @file linear.sql_in
  *
  * @brief SQL functions for linear regression
  * @date January 2011
  *
  * @sa For a brief introduction to linear regression, see the module
  *     description \ref grp_linreg.
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')

 /**
 @addtogroup grp_linreg

 <div class="toc"><b>Contents</b><ul>
 <li class="level1"><a href="#about">About</a></li>
 <li class="level1"><a href="#train">Usage</a></li>
 <li class="level2"><a href="#train">Training Function</a></li>
 <li class="level2"><a href="#output">Output Table</a></li>
 <li class="level2"><a href="#predict">Prediction Function</a></li>
 <li class="level1"><a href="#examples">Examples</a></li>
 <li class="level1"><a href="#seealso">See Also</a></li>
 <li class="level1"><a href="#background">Technical Background</a></li>
 <li class="level1"><a href="#literature">Literature</a></li>
 </ul></div>

 @anchor about
 @about
 Ordinary Least Squares Regression, also called Linear Regression, is a
 statistical model used to fit linear models.

 It models a linear relationship of a scalar dependent variable \f$ y \f$ to one
 or more explanatory independent variables \f$ x \f$ to build a
 model of coefficients.


 @anchor train
 @par Training Function
 @verbatim
 linregr_train(source_table, out_table,
               dependent_varname,
               independent_varname,
               input_group_cols := NULL,
               heteroskedasticity_option := FALSE)
 @endverbatim

 <DL class="arglist">
 <DT>source_table</DT>
 <DD>Text value. The name of the table containing the training data.</DD>

 <DT>out_table</DT>
 <DD>Text value. Name of the generated table containing the output model.</DD>

 <DT>dependent_varname</DT>
 <DD>Text value. Expression to evaluate for the dependent variable.</DD>

 <DT>independent_varname</DT>
 <DD>Text value. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD>

 <DT>input_group_cols</DT>
 <DD>Text value. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated. Default value: NULL.</DD>

 <DT>heteroskedasticity_option</DT>
 <DD>Boolean value. When True, the heteroskedasticity of the model is also calculated and returned with the results. Default value: False.</DD>
 </DL>

 @anchor output
 @par Output Table
 The output table produced by the linear regression training function contains the following columns.
 <DL class="arglist">
 <DT>\<...></DT>
 <DD>Any grouping columns provided during training.
 Present only if the grouping option is used.</DD>

 <DT>coef</DT>
 <DD>Float array. Vector of the coefficients of the regression.</DD>

 <DT>r2</DT>
 <DD>Float. R-squared coefficient of determination of the model.</DD>

 <DT>std_err</DT>
 <DD>Float array. Vector of the standard error of the coefficients.</DD>

 <DT>t_stats</DT>
 <DD>Float array. Vector of the t-statistics of the coefficients.</DD>

 <DT>p_values</DT>
 <DD>Float array. Vector of the p-values of the coefficients.</DD>

 <DT>condition_no</DT>
 <DD>Float. The condition number of the \f$X^{*}X\f$
 matrix. A high condition number is usually an indication that there may be
 some numeric instability in the result, yielding a less reliable model. A high
 condition number often results when there is a significant amount of
 collinearity in the underlying design matrix, in which case other regression
 techniques, such as elastic net regression, may be more appropriate.</DD>

 <DT>bp_stats</DT>
 <DD>Float. The Breusch-Pagan statistic of heteroskedasticity. Present
 only if the heteroskedasticity argument was set to True when the model was
 trained.</DD>

 <DT>bp_p_value</DT>
 <DD>Float. The Breusch-Pagan calculated p-value. Present only if
 the heteroskedasticity parameter was set to True when the model was
 trained.</DD>
 </DL>

 @anchor predict
 @par Prediction Function
 @verbatim
 linregr_predict(
   coef,
   col_ind
 )
 @endverbatim
 <dl class="arglist">
 <dt>coef</dt>
 <dd>Float array. Vector of the coefficients of regression.</dd>

 <dt>col_ind</dt>
 <dd>Float array. The independent variables, given as an array expression over the column names (for example, <tt>ARRAY[1, tax, bath, size]</tt>).</dd>
 </dl>

 @anchor examples
 @par Examples
 -#  Create an input data set.
 @verbatim
 CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
             size INT, lot INT);
 COPY houses FROM STDIN WITH DELIMITER '|';
   1 |  590 |       2 |    1 |  50000 |  770 | 22100
   2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
   3 |   20 |       3 |    1 |  22500 | 1060 |  3500
   4 |  870 |       2 |    2 |  90000 | 1300 | 17500
   5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
   6 | 1350 |       2 |    1 |  90500 |  820 | 25700
   7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
   8 |  680 |       2 |    1 | 142500 | 1170 | 22000
   9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
  10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
  11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
  12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
  13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
 \.
 @endverbatim
 -#  Train a regression model.
 @verbatim
 -- A single regression for all the data
 SELECT madlib.linregr_train(
     'houses', 'houses_linregr', 'price', 'array[1, tax, bath, size]');

 -- 3 output models, one for each value of "bedroom"
 SELECT madlib.linregr_train(
     'houses', 'houses_linregr_bedroom', 'price', 'array[1, tax, bath, size]',
     'bedroom');
 @endverbatim
 -# Examine the resulting models.
 @verbatim
 -- Set extended display on for easier reading of output
 \x on
 SELECT * from houses_linregr;
 SELECT * FROM houses_linregr_bedroom;

 -- Alternatively you can unnest the results for easier reading of output
 \x off
 SELECT unnest(array['intercept','tax','bath','size']) as attribute,
        unnest(coef) as coefficient,
        unnest(std_err) as standard_error,
        unnest(t_stats) as t_stat,
        unnest(p_values) as pvalue
 FROM houses_linregr;
 @endverbatim
 -# Use the prediction function to evaluate residuals.
 @verbatim
 SELECT houses.*,
        madlib.linregr_predict(array[1,tax,bath,size], m.coef) as predict,
          price - madlib.linregr_predict(array[1,tax,bath,size], m.coef)
             as residual
 FROM houses, houses_linregr m;
 @endverbatim

 @anchor seealso
 @sa File linear.sql_in, documenting the SQL training functions
 @sa linregr()
 @sa logregr_train()
 @sa elastic_net_train()
 @sa grp_robust
 @sa grp_clustered_errors
 @sa grp_validation

 @anchor background
 @par Technical Background

 Ordinary least-squares (OLS) linear regression refers to a stochastic model in
 which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
 is an affine function of the vector of independent variables (usually denoted
 \f$ \boldsymbol x \f$). That is,
 \f[
     E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
 \f]
 for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is
 that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)
 probability density of \f$ Y \f$ is given by
 \f[
     f(y \mid \boldsymbol x)
     =   \frac{1}{\sqrt{2 \pi \sigma^2}}
         \cdot \exp\left(-\frac{1}{2 \sigma^2}
             \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
     \,.
 \f]
 OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
 that maximizes the likelihood of the observations.

 Let
 - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent
     variables, with \f$ n \f$ rows, containing the observed values of the
     dependent variable,
 - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
   columns and \f$ n \f$ rows, containing all observed vectors of independent
   variables \f$ \boldsymbol x_i \f$ as rows,
 - \f$ X^T \f$ denote the transpose of \f$ X \f$,
 - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.

 Maximizing the likelihood is equivalent to maximizing the log-likelihood
 \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
 minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
 squared residuals or sum of squared errors of prediction),
 \f[
     RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
         = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
     \,.
 \f]
 The first-order conditions yield that the \f$ RSS \f$ is minimized at
 \f[
     \boldsymbol c = (X^T X)^+ X^T \boldsymbol y
     \,.
 \f]

 Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
 sum of squares</b> \f$ ESS \f$ (also called the regression sum of
 squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
 done according to the following formulas:
 \f{align*}{
     ESS & = \boldsymbol y^T X \boldsymbol c
         -   \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
     TSS & = \sum_{i=1}^n y_i^2
         -   \frac{ \left( \sum_{i=1}^n y_i \right)^2 }{n} \\
     R^2 & = \frac{ESS}{TSS}
 \f}
 Note: The last equality follows from the definition
 \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
 \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:
 http://en.wikipedia.org/wiki/Sum_of_squares

 We estimate the variance
 \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
 \f[
     \sigma^2 = \frac{RSS}{n - k}
 \f]
 and compute the t-statistic for coefficient \f$ i \f$ as
 \f[
     t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
     \,.
 \f]

 The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a
 value at least as extreme as the one observed, provided that the null hypothesis
 (\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the
 cumulative distribution function of the Student-t distribution with \f$ \nu \f$ degrees of freedom,
 the \f$ p \f$-value for coefficient \f$ i \f$
 is therefore
 \f[
     p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
 \f]
 where \f$ T \f$ is a student-t distributed random variable with mean 0.

 The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed
 as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$
 is the largest singular value of \f$X\f$ i.e. the square root of the largest
 eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:

 \f[
     \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,
 \f]
 where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.
 The condition number of a linear regression problem
 is a worst-case measure of how sensitive the
 result is to small perturbations of the input. A large condition number (say,
 more than 1000) indicates the presence of significant multicollinearity.

 @anchor literature
 @literature

 [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
     2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf

 [2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.

 [3] Wikipedia: Spectral Norm,
     http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm

 [4] Wikipedia: Breusch–Pagan test,
     http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test

 [5] Wikipedia: Heteroscedasticity-consistent standard errors,
 http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors


 @internal
 @sa Namespace \ref madlib::modules::regress
     documenting the implementation in C++
 @endinternal
 */
 ---------------------------------------------------------------------------
 -- Composite value produced by the linregr aggregate (the return type of
 -- linregr_final). Field order matters: output tables are filled positionally.
 CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
     coef          DOUBLE PRECISION[],  -- regression coefficients
     r2            DOUBLE PRECISION,    -- coefficient of determination
     std_err       DOUBLE PRECISION[],  -- standard errors of the coefficients
     t_stats       DOUBLE PRECISION[],  -- t-statistics of the coefficients
     p_values      DOUBLE PRECISION[],  -- p-values of the coefficients
     condition_no  DOUBLE PRECISION     -- condition number of X^T X
 );

 -- State-transition function (SFUNC) of the linregr aggregate: takes the
 -- current aggregate state plus one observation (y, x) and returns the new
 -- state. The implementation lives in the C module.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
     state  MADLIB_SCHEMA.bytea8,
     y      DOUBLE PRECISION,
     x      DOUBLE PRECISION[]
 ) RETURNS MADLIB_SCHEMA.bytea8
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 -- Combines two partial linregr aggregate states into one. Registered as the
 -- prefunc of the linregr aggregate on Greenplum builds.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
     state1  MADLIB_SCHEMA.bytea8,
     state2  MADLIB_SCHEMA.bytea8
 ) RETURNS MADLIB_SCHEMA.bytea8
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 -- Final function (FINALFUNC) of the linregr aggregate: converts the
 -- accumulated state into a linregr_result value.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
     state  MADLIB_SCHEMA.bytea8
 ) RETURNS MADLIB_SCHEMA.linregr_result
 AS 'MODULE_PATHNAME'
 LANGUAGE C
 IMMUTABLE STRICT;


 -- Composite value produced by the heteroskedasticity_test_linregr aggregate
 -- (the return type of hetero_linregr_final).
 CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
     bp_stats    DOUBLE PRECISION,  -- Breusch-Pagan test statistic
     bp_p_value  DOUBLE PRECISION   -- p-value of the Breusch-Pagan statistic
 );

 -- State-transition function (SFUNC) of the heteroskedasticity_test_linregr
 -- aggregate: takes the current state, one observation (y, x) and the OLS
 -- coefficient array, and returns the new state. Implemented in the C module.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
     state  MADLIB_SCHEMA.bytea8,
     y      DOUBLE PRECISION,
     x      DOUBLE PRECISION[],
     coef   DOUBLE PRECISION[]
 ) RETURNS MADLIB_SCHEMA.bytea8
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 -- Combines two partial states of the heteroskedasticity test aggregate into
 -- one. Referenced as the prefunc of heteroskedasticity_test_linregr.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
     state1  MADLIB_SCHEMA.bytea8,
     state2  MADLIB_SCHEMA.bytea8
 ) RETURNS MADLIB_SCHEMA.bytea8
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 -- Final function (FINALFUNC) of the heteroskedasticity_test_linregr
 -- aggregate: converts the accumulated state into a
 -- heteroskedasticity_test_result value.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
     state  MADLIB_SCHEMA.bytea8
 ) RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
 AS 'MODULE_PATHNAME'
 LANGUAGE C
 IMMUTABLE STRICT;

 /**
  * @brief Compute studentized Breusch-Pagan heteroskedasticity test for
  * linear regression.
  *
  * @param dependentVariable Column containing the dependent variable
  * @param independentVariables Column containing the array of independent variables
  * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)
  *
  * @par
  * To include an intercept in the model, set one coordinate in the
  * <tt>independentVariables</tt> array to 1.
  *
  * @return A composite value (heteroskedasticity_test_result):
  *  - <tt>bp_stats FLOAT8</tt> - The Breusch-Pagan test statistic
  *  - <tt>bp_p_value FLOAT8</tt> - The p-value of the test statistic
  *
  * @usage
  *  <pre> SELECT (heteroskedasticity_test_linregr(<em>dependentVariable</em>,
  *  <em>independentVariables</em>, coef)).*
  *  FROM (
  *    SELECT (linregr(<em>dependentVariable</em>, <em>independentVariables</em>)).coef
  *    FROM <em>sourceName</em>
  *  ) AS ols_coef, <em>sourceName</em> as src;
  * </pre>
  */
 CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
     /*+ "dependentVariable" */ DOUBLE PRECISION,
     /*+ "independentVariables" */ DOUBLE PRECISION[],
     /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (

     SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
     STYPE=MADLIB_SCHEMA.bytea8,
     FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
     -- The merge function is registered only on Greenplum builds; the guard
     -- uses the same build macro as the linregr aggregate in this file.
     m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')
     INITCOND=''
 );

 /**
  * @brief Aggregate that fits an ordinary least-squares regression and
  * reports diagnostic statistics.
  *
  * @param dependentVariable Column containing the dependent variable
  * @param independentVariables Column containing the array of independent variables
  *
  * @par
  * No intercept is added implicitly. To fit one, make one coordinate of the
  * <tt>independentVariables</tt> array the constant 1.
  *
  * @return A composite value (linregr_result) with the fields:
  *  - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
  *  - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
  *  - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
  *    \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
  *  - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
  *  - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
  *  - <tt>condition_no FLOAT8</tt> - The condition number of matrix
  *    \f$ X^T X \f$.
  *
  * @usage
  *  - All coefficients and diagnostic statistics:\n
  *    <pre>SELECT (linregr(<em>dependentVariable</em>,
  *        <em>independentVariables</em>)).*
  *FROM <em>sourceName</em>;</pre>
  *  - Only the vector of coefficients \f$ \boldsymbol c \f$:\n
  *    <pre>SELECT (linregr(<em>dependentVariable</em>,
  *        <em>independentVariables</em>)).coef
  *FROM <em>sourceName</em>;</pre>
  *  - A subset of the output columns, e.g., the coefficients
  *    \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
  *    the p-values \f$ \boldsymbol p \f$:
  *    <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
  *FROM (
  *    SELECT linregr( <em>dependentVariable</em>,
  *                    <em>independentVariables</em>) AS lr
  *    FROM <em>sourceName</em>
  *) AS subq;</pre>
  */
 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
     /*+ "dependentVariable" */ DOUBLE PRECISION,
     /*+ "independentVariables" */ DOUBLE PRECISION[]
 ) (
     SFUNC     = MADLIB_SCHEMA.linregr_transition,
     STYPE     = MADLIB_SCHEMA.bytea8,
     FINALFUNC = MADLIB_SCHEMA.linregr_final,
     -- merge function for partial states, emitted on Greenplum builds only
     m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
     INITCOND  = ''
 );
 --------------------------- INTERNAL ---------------------------------------


 /**
   * @brief Return heteroskedasticity values for specific linear regression
   * coefficients
   *
   * Runs the heteroskedasticity_test_linregr aggregate over every row of
   * the source table, using the supplied coefficient array.
   *
   * @param source_table Name of the input table
   * @param dependent_varname Expression for the dependent variable
   * @param independent_varname Expression for the independent variables
   * @param linregr_coeffs Coefficients from a previous linear regression
   *
   * @return The Breusch-Pagan statistic and p-value; a value with all fields
   *         NULL when linregr_coeffs is NULL.
   *
   * @note source_table and the two variable expressions are concatenated
   *       into dynamic SQL unescaped and must come from a trusted caller.
 **/
 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
      source_table         VARCHAR       -- name of input  table
    , dependent_varname    VARCHAR       -- name of dependent variable
    , independent_varname  VARCHAR       -- name of independent variable
    , linregr_coeffs       DOUBLE PRECISION[]  -- coeffs from linear regression
 )
 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
 DECLARE
   hsk_value     MADLIB_SCHEMA.heteroskedasticity_test_result;
   coef_literal  TEXT;
 BEGIN
   -- Without coefficients there is nothing to test. Return the still
   -- unassigned (all NULL) result instead of executing a NULL query string.
   IF linregr_coeffs IS NULL THEN
     RETURN hsk_value;
   END IF;

   -- Pass the coefficients as a quoted array literal rather than as
   -- ARRAY[...] source text: the literal form also accepts NaN, Infinity
   -- and an empty array, all of which are invalid inside ARRAY[...].
   coef_literal := linregr_coeffs;

   EXECUTE '
     SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
           || dependent_varname    || ' , '
           || independent_varname  || ' , '
           || quote_literal(coef_literal) || '::DOUBLE PRECISION[])).*
     FROM ' || source_table
   INTO hsk_value;

   RETURN hsk_value;
 END
 $$ LANGUAGE plpgsql VOLATILE;
 /**
   * @brief Run the linregr aggregate over all rows of the source table.
   *
   * @param source_table Name of the input table
   * @param dependent_varname Expression for the dependent variable
   * @param independent_varname Expression for the independent variables
   *
   * @return The linregr_result row produced by the aggregate.
   *
   * @note All three arguments are concatenated into dynamic SQL unescaped.
 **/
 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
      source_table         VARCHAR       -- name of input  table
    , dependent_varname    VARCHAR       -- name of dependent variable
    , independent_varname  VARCHAR       -- name of independent variable
 )
 RETURNS MADLIB_SCHEMA.linregr_result AS $$
 DECLARE
   regression_sql  VARCHAR;
   fit             MADLIB_SCHEMA.linregr_result;
 BEGIN
   -- Assemble the statement first so it can be inspected while debugging.
   regression_sql := '
         SELECT (MADLIB_SCHEMA.linregr( '
                 || dependent_varname    || ' , '
                 || independent_varname  || ')
               ).*
         FROM ' || source_table;

   EXECUTE regression_sql INTO fit;
   RETURN fit;
 END
 $$ LANGUAGE plpgsql VOLATILE;


 /**
  * @brief Build the opening part of an INSERT statement that stores one
  * regression result.
  *
  * The returned text is INSERT INTO out_table VALUES ( followed by the six
  * result fields in the order coef, r2, std_err, t_stats, p_values,
  * condition_no. The closing parenthesis is NOT included, so the caller can
  * append further values before closing and executing the statement.
  *
  * NULL array fields are written as an empty array literal and NULL scalar
  * fields as zero.
  *
  * @param lin_rst Regression result to serialize
  * @param out_table Name of the target table (inserted verbatim)
  **/
 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
     lin_rst MADLIB_SCHEMA.linregr_result,
     out_table TEXT
 )
 RETURNS VARCHAR AS $$
 DECLARE
   coef_sql       TEXT;
   r2_sql         TEXT;
   std_err_sql    TEXT;
   t_stats_sql    TEXT;
   p_values_sql   TEXT;
   condition_sql  TEXT;
 BEGIN
   IF (lin_rst).coef IS NULL THEN
     coef_sql := '''{}'',';
   ELSE
     coef_sql := 'ARRAY[' || array_to_string((lin_rst).coef, ',')     || '], ';
   END IF;

   IF (lin_rst).r2 IS NULL THEN
     r2_sql := '0.0,';
   ELSE
     r2_sql := (lin_rst).r2 || ',';
   END IF;

   IF (lin_rst).std_err IS NULL THEN
     std_err_sql := '''{}'',';
   ELSE
     std_err_sql := 'ARRAY[' || array_to_string((lin_rst).std_err, ',')  || '], ';
   END IF;

   IF (lin_rst).t_stats IS NULL THEN
     t_stats_sql := '''{}'',';
   ELSE
     t_stats_sql := 'ARRAY[' || array_to_string((lin_rst).t_stats, ',')  || '], ';
   END IF;

   IF (lin_rst).p_values IS NULL THEN
     p_values_sql := '''{}'',';
   ELSE
     p_values_sql := 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], ';
   END IF;

   -- Deliberately left as a CASE expression: one branch is a string literal
   -- and the other a DOUBLE PRECISION, and the text produced for the NULL
   -- case depends on how the CASE result type is resolved.
   condition_sql := CASE
                      WHEN (lin_rst).condition_no is NULL
                      THEN '0.0'
                      ELSE (lin_rst).condition_no
                    END;

   RETURN 'INSERT INTO ' || out_table || ' VALUES ('
          || coef_sql
          || r2_sql
          || std_err_sql
          || t_stats_sql
          || p_values_sql
          || condition_sql;
 END;
 $$ LANGUAGE plpgsql VOLATILE;

 /**
  * @brief Compute linear regression coefficients and, optionally,
  * heteroskedasticity values for the whole source table (no grouping) and
  * store them as a single row in a newly created output table.
  *
  * @param source_table Name of the input table
  * @param out_table Name of the output table; an exception is raised when
  *        a table with this name already exists
  * @param dependent_varname Expression for the dependent variable
  * @param independent_varname Expression for the independent variables
  * @param heteroskedasticity_option When true, the columns bp_stats and
  *        bp_p_value are added to the output table and filled
  *
  * @note The table names and variable expressions are concatenated into
  *       dynamic SQL unescaped and must come from a trusted caller.
  **/
 CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
      source_table         VARCHAR       -- name of input  table
    , out_table            VARCHAR       -- name of output table
    , dependent_varname    VARCHAR       -- name of dependent variable
    , independent_varname  VARCHAR       -- name of independent variable
    , heteroskedasticity_option  BOOLEAN -- do you want heteroskedasticity output
 )
 RETURNS VOID AS $$
 DECLARE
 insert_string           VARCHAR;
 lin_rst                 MADLIB_SCHEMA.linregr_result;
 hsk_value               MADLIB_SCHEMA.heteroskedasticity_test_result;
 -- Stand-in for the real table name while the value list is post-processed.
 -- It must not contain the substrings "nan" or "infinity" in any case.
 table_marker            VARCHAR := '@@OUT_TABLE@@';
 BEGIN
     IF (source_table IS NULL OR source_table = '') THEN
         RAISE EXCEPTION 'Invalid input table name given.';
     END IF;
     IF (out_table IS NULL OR out_table = '') THEN
         RAISE EXCEPTION 'Invalid output table name given.';
     END IF;
     IF (dependent_varname IS NULL OR dependent_varname = '') THEN
         RAISE EXCEPTION 'Invalid dependent variable name given.';
     END IF;
     IF (independent_varname IS NULL OR independent_varname = '') THEN
         RAISE EXCEPTION 'Invalid independent variable name given.';
     END IF;
     IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
         RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
     END IF;

     -- create output table with appropriate column names
     -- (the DROP is kept from the original code; after the existence check
     -- above it is expected to be a no-op)
     EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
     EXECUTE '
             CREATE TABLE ' || out_table || ' (
                 coef DOUBLE PRECISION[],
                 r2 DOUBLE PRECISION,
                 std_err DOUBLE PRECISION[],
                 t_stats DOUBLE PRECISION[],
                 p_values DOUBLE PRECISION[],
                 condition_no DOUBLE PRECISION)';
     IF heteroskedasticity_option THEN
       -- Alter output table to add heteroskedasticity values
       EXECUTE '
             ALTER TABLE ' || out_table || '
                 ADD COLUMN bp_stats DOUBLE PRECISION,
                 ADD COLUMN bp_p_value DOUBLE PRECISION';
     END IF;

     -- compute linear regression and heteroskedasticity values (if required)
     lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
                     source_table, dependent_varname, independent_varname);

     -- Build the statement against the marker instead of the real table
     -- name. The NaN/Infinity rewrite below is case-insensitive and would
     -- otherwise also rewrite table names such as "finance_model".
     insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
                     lin_rst, table_marker);

     IF heteroskedasticity_option THEN
       hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
                     source_table, dependent_varname,
                     independent_varname, (lin_rst).coef);
       -- Append before the NaN/Infinity rewrite so these two values are
       -- covered by it as well; a NULL statistic is stored as SQL NULL
       -- instead of turning the whole statement into NULL.
       insert_string := insert_string
           || ',' || COALESCE(CAST((hsk_value).bp_stats AS TEXT), 'NULL')
           || ',' || COALESCE(CAST((hsk_value).bp_p_value AS TEXT), 'NULL');
     END IF;

     -- Ensure Infinity and NaN are cast properly
     insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
                                     '''Infinity''::double precision', 'gi');
     insert_string := REGEXP_REPLACE(insert_string, 'NaN',
                                     '''NaN''::double precision', 'gi');

     -- Put the real table name back, then complete and run the statement.
     insert_string := REPLACE(insert_string, table_marker, out_table);
     EXECUTE insert_string || ')';
 END
 $$ LANGUAGE plpgsql VOLATILE;
 ---------------------------------------------------------------------------
 /**
   * @brief Train one linear regression over the whole source table and store
   * the result in a new output table, without heteroskedasticity values.
   *
   * Thin wrapper around __internal_linregr_train_hetero with the
   * heteroskedasticity option switched off. That function raises an
   * exception if the output table already exists.
   *
 **/
 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
      source_table         VARCHAR       -- name of input  table
    , out_table            VARCHAR       -- name of output table
    , dependent_varname    VARCHAR       -- name of dependent variable
    , independent_varname  VARCHAR       -- name of independent variable
    )
 RETURNS VOID AS $$
 BEGIN
   PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
       source_table,
       out_table,
       dependent_varname,
       independent_varname,
       False   -- heteroskedasticity_option
   );
 END;
 $$ LANGUAGE plpgsql VOLATILE;

 --------------------- GROUPING ---------------------------------------------
 /**
   * @brief Linear regression training function with grouping support and
   * option for heteroskedasticity values.
   *
   * After validating its arguments the function either
   *  - delegates to __internal_linregr_train_hetero when input_group_cols is
   *    NULL (a single model for the whole table), or
   *  - creates out_table with the grouping columns followed by the
   *    regression columns and inserts one row per group.
   *
   * client_min_messages is set to warning for the duration of the call and
   * restored to its previous value before returning.
   *
   * @param source_table Name of the input table; must exist
   * @param out_table Name of the output table; must not exist yet
   * @param dependent_varname Expression for the dependent variable
   * @param independent_varname Expression for the independent variables
   * @param input_group_cols List of grouping column names (split with
   *        _string_to_array), or NULL for no grouping
   * @param heteroskedasticity_option When true, the columns bp_stats and
   *        bp_p_value are added to the output table and filled
   *
   * @note Table names, grouping columns and variable expressions are
   *       concatenated into dynamic SQL as identifiers/expressions and must
   *       come from a trusted caller.
  **/
 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
      source_table               VARCHAR       -- name of input  table
    , out_table                  VARCHAR       -- name of output table
    , dependent_varname          VARCHAR       -- name of dependent variable
    , independent_varname        VARCHAR       -- name of independent variable
    , input_group_cols           VARCHAR       -- names of columns to group-by
    , heteroskedasticity_option  BOOLEAN -- heteroskedasticity
   )
 RETURNS VOID AS $$
 DECLARE
   input_table_name              VARCHAR[];
   group_cols                    VARCHAR[];
   actual_table_name             VARCHAR;
   schema_name                   VARCHAR;
   table_creation_string         VARCHAR;
   group_string                  VARCHAR;
   group_array_length            INTEGER;
   col_data_type                 VARCHAR;
   each_group                    INTEGER;
   linregr_fitting_rst           VARCHAR;
   old_msg_level                 TEXT;
 BEGIN

   -- Silence notices while running; the previous level is restored at the end.
   EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
   EXECUTE 'SET client_min_messages TO warning';

   IF (source_table IS NULL OR source_table = '') THEN
       RAISE EXCEPTION 'Invalid input table name given.';
   END IF;
   IF (out_table IS NULL OR out_table = '') THEN
       RAISE EXCEPTION 'Invalid output table name given.';
   END IF;
   IF (dependent_varname IS NULL OR dependent_varname = '') THEN
       RAISE EXCEPTION 'Invalid dependent variable name given.';
   END IF;
   IF (independent_varname IS NULL OR independent_varname = '') THEN
       RAISE EXCEPTION 'Invalid independent variable name given.';
   END IF;

   IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
     RAISE EXCEPTION 'Source table name does not exist.';
   END IF;

   IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
     RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
   END IF;

   IF (input_group_cols IS NULL) THEN
       -- no grouping: a single model for the whole table
       PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
                     source_table, out_table,
                     dependent_varname, independent_varname,
                     heteroskedasticity_option);
   ELSE
       group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
       -- create output table
       EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
       table_creation_string := 'CREATE TABLE ' || out_table || '(';
       group_array_length = array_upper(group_cols, 1);

       -- Split an optionally schema-qualified table name; an unqualified
       -- name is looked up in the current schema.
       input_table_name = regexp_split_to_array(source_table, E'\\.');
       IF array_upper(input_table_name, 1) = 1 THEN
         actual_table_name = input_table_name[1];
         schema_name  := current_schema();
       ELSIF array_upper(input_table_name, 1) = 2 THEN
         actual_table_name = input_table_name[2];
         schema_name  = input_table_name[1];
       ELSE
         RAISE EXCEPTION 'Incorrect input source table name provided';
       END IF;

       -- Check that each grouping column exists
       FOR each_group in 1 .. group_array_length
       LOOP
         IF NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
                                       group_cols[each_group]) THEN
               RAISE EXCEPTION 'Grouping column % does not exist',
                                       group_cols[each_group];
         END IF;
       END LOOP;

       -- Add every grouping column, with the type it has in the source
       -- table, to the definition of the output table.
       FOR each_group in 1 .. group_array_length
       LOOP
           -- The names are values in this query, so quote them as literals.
           EXECUTE 'SELECT data_type FROM information_schema.columns
                    WHERE
                         table_schema = ' || quote_literal(schema_name) || '
                         AND table_name = ' || quote_literal(actual_table_name) || '
                         AND column_name = ' || quote_literal(group_cols[each_group])
           INTO col_data_type;

           -- A missing catalog row (for example a table that is reachable
           -- through the search path but lives in another schema, or a name
           -- with different letter case) would otherwise turn the whole
           -- CREATE TABLE string into NULL and fail with an unrelated error.
           IF col_data_type IS NULL THEN
               RAISE EXCEPTION 'Could not determine the data type of grouping column % in table %',
                               group_cols[each_group], source_table;
           END IF;

           -- NOTE(review): information_schema reports array and user-defined
           -- columns as ARRAY / USER-DEFINED, which are not usable type names
           -- in CREATE TABLE; such grouping columns presumably still fail
           -- below. Confirm whether they need to be supported.
           table_creation_string := table_creation_string
                                   || group_cols[each_group]
                                   || ' ' || col_data_type || ',';
       END LOOP;

       -- finish creating the output table
       EXECUTE table_creation_string || '
               coef DOUBLE PRECISION[],
               r2 DOUBLE PRECISION,
               std_err DOUBLE PRECISION[],
               t_stats DOUBLE PRECISION[],
               p_values DOUBLE PRECISION[],
               condition_no DOUBLE PRECISION)';
       IF heteroskedasticity_option THEN
         EXECUTE 'ALTER TABLE ' || out_table || '
                 ADD COLUMN bp_stats DOUBLE PRECISION,
                 ADD COLUMN bp_p_value DOUBLE PRECISION';
       END IF;

       -- comma separated grouping columns, used for SELECT, GROUP BY and USING
       group_string := array_to_string(group_cols, ',');

       IF heteroskedasticity_option THEN
         -- Two passes: fit one model per group into a temporary table, then
         -- join it back to the source rows so that the heteroskedasticity
         -- aggregate sees the coefficients of the row's own group.
         linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
         EXECUTE '
             DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
             CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
                 SELECT
                     '|| group_string ||',
                     (MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).*
                 FROM '|| source_table ||'
                 GROUP BY '|| group_string;

         EXECUTE '
           INSERT INTO ' || out_table || '
             SELECT *
             FROM
                 '|| linregr_fitting_rst ||'
                 JOIN (
                     SELECT
                         '|| group_string ||',
                         (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
                             || dependent_varname    || ','
                             || independent_varname  || ', t.coef)).*
                     FROM
                         '|| source_table ||' AS s
                         JOIN
                         '|| linregr_fitting_rst ||' AS t
                     USING ('  || group_string || ')
                     GROUP BY '  || group_string ||') z
                 USING ('|| group_string ||')';

         EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
       ELSE
         EXECUTE '
           INSERT INTO ' || out_table || '
             SELECT ' || group_string || ', (result).coef, (result).r2,
                     (result).std_err, (result).t_stats,
                     (result).p_values, (result).condition_no
             FROM (
               SELECT ' || group_string ||
                     ', MADLIB_SCHEMA.linregr( '
                         || dependent_varname || ' , '
                         || independent_varname || ' )
                       AS result
               FROM ' || source_table || '
               GROUP BY ' || group_string ||
             ') subq';
       END IF;
   END IF;

   EXECUTE 'SET client_min_messages TO '|| old_msg_level;
 END;
 $$ LANGUAGE plpgsql VOLATILE;
 ---------------------------------------------------------------------------
 /**
   * @brief Linear regression training function with grouping support.
   *
   * Convenience overload: forwards to the six-argument linregr_train with
   * the heteroskedasticity option set to FALSE.
  **/
 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
      source_table               VARCHAR       -- name of input  table
    , out_table                  VARCHAR       -- name of output table
    , dependent_varname          VARCHAR       -- name of dependent variable
    , independent_varname        VARCHAR       -- name of independent variable
    , group_cols                 VARCHAR       -- names of columns to group-by
   )
 RETURNS VOID AS $$
 BEGIN
   PERFORM MADLIB_SCHEMA.linregr_train(
       source_table,
       out_table,
       dependent_varname,
       independent_varname,
       group_cols,
       FALSE   -- heteroskedasticity_option
   );
 END;
 $$ LANGUAGE plpgsql VOLATILE;
 ---------------------------------------------------------------------------