blob: 091b5f19580e866274c1b5382cf160dc74199c21 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file linear.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa For a brief introduction to linear regression, see the module
* description \ref grp_linreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_linreg
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#about">About</a></li>
<li class="level1"><a href="#train">Usage</a></li>
<li class="level2"><a href="#train">Training Function</a></li>
<li class="level2"><a href="#output">Output Table</a></li>
<li class="level2"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#seealso">See Also</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
</ul></div>
@anchor about
@about
Ordinary Least Squares Regression, also called Linear Regression, is a
statistical model used to fit linear models.
It models a linear relationship of a scalar dependent variable \f$ y \f$ to one
or more explanatory independent variables \f$ x \f$ to build a
model of coefficients.
@anchor train
@par Training Function
@verbatim
linregr_train(source_table, out_table,
dependent_varname,
independent_varname,
input_group_cols := NULL,
heteroskedasticity_option := NULL)
@endverbatim
<DL class="arglist">
<DT>source_table</DT>
<DD>Text value. The name of the table containing the training data.</DD>
<DT>out_table</DT>
<DD>Text value. Name of the generated table containing the output model.</DD>
<DT>dependent_varname</DT>
<DD>Text value. Expression to evaluate for the dependent variable.</DD>
<DT>independent_varname</DT>
<DD>Text value. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD>
<DT>input_group_cols</DT>
<DD>Text value. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated. Default value: NULL.</DD>
<DT>heteroskedasticity_option</DT>
<DD>Boolean value. When True, the heteroskedacity of the model is also calculated and returned with the results. Default value: False.</DD>
</DL>
@anchor output
@par Output Table
The output table produced by the linear regression training function contains the following columns.
<DL class="arglist">
<DT>\<...></DT>
<DD>Any grouping columns provided during training.
Present only if the grouping option is used.</DD>
<DT>coef</DT>
<DD>Float array. Vector of the coefficients of the regression.</DD>
<DT>r2</DT>
<DD>Float. R-squared coefficient of determination of the model.</DD>
<DT>std_err</DT>
<DD>Float array. Vector of the standard error of the coefficients.</DD>
<DT>t_stats</DT>
<DD>Float array. Vector of the t-statistics of the coefficients.</DD>
<DT>p_values</DT>
<DD>Float array. Vector of the p-values of the coefficients.</DD>
<DT>condition_no</DT>
<DD>Float array. The condition number of the \f$X^{*}X\f$
matrix. A high condition number is usually an indication that there may be
some numeric instability in the result yielding a less reliable model. A high
condition number often results when there is a significant amount of
colinearity in the underlying design matrix, in which case other regression
techniques, such as elastic net regression, may be more appropriate.</DD>
<DT>bp_stats</DT>
<DD>Float. The Breush-Pagan statistic of heteroskedacity. Present
only if the heteroskedacity argument was set to True when the model was
trained.</DD>
<DT>bp_p_value</DT>
<DD>Float. The Breush-Pagan calculated p-value. Present only if
the heteroskedacity parameter was set to True when the model was
trained.</DD>
</DL>
@anchor predict
@par Prediction Function
@verbatim
linregr_predict(
coef,
col_ind
)
@endverbatim
<dl class="arglist">
<dt>coef</dt>
<dd>Float array. Vector of the coefficients of regression.</dd>
<dt>col_ind</dt>
<dd>Float array. An array containing the independent variable column names. </dd>
@anchor examples
@par Examples
-# Create an input data set.
@verbatim
CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
size INT, lot INT);
COPY houses FROM STDIN WITH DELIMITER '|';
1 | 590 | 2 | 1 | 50000 | 770 | 22100
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
3 | 20 | 3 | 1 | 22500 | 1060 | 3500
4 | 870 | 2 | 2 | 90000 | 1300 | 17500
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
6 | 1350 | 2 | 1 | 90500 | 820 | 25700
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
8 | 680 | 2 | 1 | 142500 | 1170 | 22000
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
\.
@endverbatim
-# Train a regression model.
@verbatim
-- A single regression for all the data
SELECT madlib.linregr_train(
'houses', 'houses_linregr', 'price', 'array[1, tax, bath, size]');
-- 3 output models, one for each value of "bedroom"
SELECT madlib.linregr_train(
'houses', 'houses_linregr_bedroom', 'price', 'array[1, tax, bath, size]',
'bedroom');
@endverbatim
-# Examine the resulting models.
@verbatim
-- Set extended display on for easier reading of output
\x on
SELECT * from houses_linregr;
SELECT * FROM houses_linregr_bedroom;
-- Alternatively you can unnest the results for easier reading of output
\x off
SELECT unnest(array['intercept','tax','bath','size']) as attribute,
unnest(coef) as coefficient,
unnest(std_err) as standard_error,
unnest(t_stats) as t_stat,
unnest(p_values) as pvalue
FROM houses_linregr;
@endverbatim
-# Use the prediction function to evaluate residuals.
@verbatim
SELECT houses.*,
madlib.linregr_predict(array[1,tax,bath,size], m.coef) as predict,
price - madlib.linregr_predict(array[1,tax,bath,size], m.coef)
as residual
FROM houses, houses_linregr m;
@endverbatim
@anchor seealso
@sa File linear.sql_in, documenting the SQL training functions
@sa linregr()
@sa logregr_train()
@sa elastic_net_train()
@sa grp_robust
@sa grp_clustered_errors
@sa grp_validation
@anchor background
@par Technical Background
Ordinary least-squares (OLS) linear regression refers to a stochastic model in
which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
is an affine function of the vector of independent variables (usually denoted
\f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is
that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)
probability density of \f$ Y \f$ is given by
\f[
f(y \mid \boldsymbol x)
= \frac{1}{\sqrt{2 \pi \sigma^2}}
\cdot \exp\left(-\frac{1}{2 \sigma^2}
\cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
\,.
\f]
OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
that maximizes the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent
variables, with \f$ n \f$ rows, containing the observed values of the
dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
columns and \f$ n \f$ rows, containing all observed vectors of independent
variables.
\f$ \boldsymbol x_i \f$ as rows,
- \f$ X^T \f$ denote the transpose of \f$ X \f$,
- \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
Maximizing the likelihood is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
squared residuals or sum of squared errors of prediction),
\f[
RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
= (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
\,.
\f]
The first-order conditions yield that the \f$ RSS \f$ is minimized at
\f[
\boldsymbol c = (X^T X)^+ X^T \boldsymbol y
\,.
\f]
Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
sum of squares</b> \f$ ESS \f$ (also called the regression sum of
squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
done according to the following formulas:
\f{align*}{
ESS & = \boldsymbol y^T X \boldsymbol c
- \frac{ \| y \|_1^2 }{n} \\
TSS & = \sum_{i=1}^n y_i^2
- \frac{ \| y \|_1^2 }{n} \\
R^2 & = \frac{ESS}{TSS}
\f}
Note: The last equality follows from the definition
\f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
\f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:
http://en.wikipedia.org/wiki/Sum_of_squares
We estimate the variance
\f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
\f[
\sigma^2 = \frac{RSS}{n - k}
\f]
and compute the t-statistic for coefficient \f$ i \f$ as
\f[
t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
\,.
\f]
The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a
value at least as extreme as the one observed, provided that the null hypothesis
(\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the
cumulative density function of student-t with \f$ \nu \f$ degrees of freedom,
the \f$ p \f$-value for coefficient \f$ i \f$
is therefore
\f[
p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
\f]
where \f$ T \f$ is a student-t distributed random variable with mean 0.
The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed
as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$
is the largest singular value of \f$X\f$ i.e. the square root of the largest
eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:
\f[
\|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,
\f]
where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.
The condition number of a linear regression problem
is a worst-case measure of how sensitive the
result is to small perturbations of the input. A large condition number (say,
more than 1000) indicates the presence of significant multicollinearity.
@anchor literature
@literature
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
[2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.
[3] Wikipedia: Spectral Norm,
http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm
[4] Wikipedia: Breusch–Pagan test,
http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
[5] Wikipedia: Heteroscedasticity-consistent standard errors,
http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
---------------------------------------------------------------------------
CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
condition_no DOUBLE PRECISION
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
-- Final functions
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.linregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
bp_stats DOUBLE PRECISION,
bp_p_value DOUBLE PRECISION
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
-- Final functions
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute studentized Breuch-Pagan heteroskedasticity test for
* linear regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc
* - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc
*
* @usage
* <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)).*
* FROM (
* SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef
* ) AS ols_coef, <em>sourceName</em> as src;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
STYPE=MADLIB_SCHEMA.bytea8,
FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')
INITCOND=''
);
/**
* @brief Compute linear regression coefficients and diagnostic statistics.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
* - <tt>condition_no FLOAT8</tt> - The condition number of matrix
* \f$ X^T X \f$.
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>)).*
*FROM <em>sourceName</em>;</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>)).coef
*FROM <em>sourceName</em>;</pre>
* - Get a subset of the output columns, e.g., only the array of coefficients
* \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
* the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
*FROM (
* SELECT linregr( <em>dependentVariable</em>,
* <em>independentVariables</em>) AS lr
* FROM <em>sourceName</em>
*) AS subq;</pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.linregr_transition,
STYPE=MADLIB_SCHEMA.bytea8,
FINALFUNC=MADLIB_SCHEMA.linregr_final,
m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
INITCOND=''
);
--------------------------- INTERNAL ---------------------------------------
/**
* @brief Return heteroskedasticity values for specific linear regression
* coefficients
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
source_table VARCHAR -- name of input table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
)
RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
DECLARE
hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
BEGIN
EXECUTE '
SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
|| dependent_varname || ' , '
|| independent_varname || ' , '
|| 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])).*
FROM ' || source_table
INTO hsk_value;
RETURN hsk_value;
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Return linear regression output for source data
*
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
source_table VARCHAR -- name of input table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
)
RETURNS MADLIB_SCHEMA.linregr_result AS $$
DECLARE
lin_rst MADLIB_SCHEMA.linregr_result;
BEGIN
EXECUTE '
SELECT (MADLIB_SCHEMA.linregr( '
|| dependent_varname || ' , '
|| independent_varname || ')
).*
FROM ' || source_table
INTO lin_rst;
RETURN lin_rst;
END
$$ LANGUAGE plpgsql VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
lin_rst MADLIB_SCHEMA.linregr_result,
out_table TEXT
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
insert_string := insert_string ||
CASE
WHEN (lin_rst).coef is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((lin_rst).coef, ',') || '], '
END ||
CASE
WHEN (lin_rst).r2 is NULL
THEN '0.0,'
ELSE (lin_rst).r2 || ','
END ||
CASE
WHEN (lin_rst).std_err is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((lin_rst).std_err, ',') || '], '
END ||
CASE
WHEN (lin_rst).t_stats is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((lin_rst).t_stats, ',') || '], '
END ||
CASE
WHEN (lin_rst).p_values is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], '
END ||
CASE
WHEN (lin_rst).condition_no is NULL
THEN '0.0'
ELSE (lin_rst).condition_no
END;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Compute linear regression coefficients and heterskedasticity values
*
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, heteroskedasticity_option BOOLEAN -- do you want heteroskedasticity output
)
RETURNS VOID AS $$
DECLARE
insert_string VARCHAR;
lin_rst MADLIB_SCHEMA.linregr_result;
hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
BEGIN
IF (source_table IS NULL OR source_table = '') THEN
RAISE EXCEPTION 'Invalid input table name given.';
END IF;
IF (out_table IS NULL OR out_table = '') THEN
RAISE EXCEPTION 'Invalid output table name given.';
END IF;
IF (dependent_varname IS NULL OR dependent_varname = '') THEN
RAISE EXCEPTION 'Invalid dependent variable name given.';
END IF;
IF (independent_varname IS NULL OR independent_varname = '') THEN
RAISE EXCEPTION 'Invalid independent variable name given.';
END IF;
IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
END IF;
-- create output table with appropriate column names
EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
EXECUTE '
CREATE TABLE ' || out_table || ' (
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
condition_no DOUBLE PRECISION)';
IF heteroskedasticity_option THEN
-- Alter output table to add heteroskedasticity values
EXECUTE '
ALTER TABLE ' || out_table || '
ADD COLUMN bp_stats DOUBLE PRECISION,
ADD COLUMN bp_p_value DOUBLE PRECISION';
END IF;
-- compute linear regression and heteroskedasticity values (if required)
lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
source_table, dependent_varname, independent_varname);
insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
lin_rst, out_table);
-- Ensure Infinity and NaN are cast properly
insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
'''Infinity''::double precision', 'gi');
insert_string := REGEXP_REPLACE(insert_string, 'NaN',
'''NaN''::double precision', 'gi');
IF heteroskedasticity_option THEN
-- add hsk values in the sql string and execute
hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
source_table, dependent_varname,
independent_varname, (lin_rst).coef);
EXECUTE
insert_string || ','
|| (hsk_value).bp_stats || ','
|| (hsk_value).bp_p_value || ')';
ELSE
-- complete the sql string and execute
EXECUTE insert_string || ')';
END IF;
END
$$ LANGUAGE plpgsql VOLATILE;
---------------------------------------------------------------------------
/**
* @brief Compute linear regression coefficients and insert into an output
* table. The function drops the table if it already exists before creating
* the table.
*
**/
CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
source_table, out_table, dependent_varname, independent_varname, False);
-- RAISE NOTICE '
-- Finished linear regression
-- * table : % (%, %)
-- Output:
-- * view : SELECT * FROM % ;', source_table, dependent_varname,
-- independent_varname, out_table;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------- GROUPING ---------------------------------------------
/**
* @brief Linear regression training function with grouping support and
* option for heteroskedasticity values.
**/
CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- names of columns to group-by
, heteroskedasticity_option BOOLEAN -- heteroskedasticity
)
RETURNS VOID AS $$
DECLARE
input_table_name VARCHAR[];
group_cols VARCHAR[];
actual_table_name VARCHAR;
schema_name VARCHAR;
table_creation_string VARCHAR;
group_string VARCHAR;
group_array_length INTEGER;
col_data_type VARCHAR;
each_group INTEGER;
linregr_fitting_rst VARCHAR;
old_msg_level TEXT;
BEGIN
EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
EXECUTE 'SET client_min_messages TO warning';
IF (source_table IS NULL OR source_table = '') THEN
RAISE EXCEPTION 'Invalid input table name given.';
END IF;
IF (out_table IS NULL OR out_table = '') THEN
RAISE EXCEPTION 'Invalid output table name given.';
END IF;
IF (dependent_varname IS NULL OR dependent_varname = '') THEN
RAISE EXCEPTION 'Invalid dependent variable name given.';
END IF;
IF (independent_varname IS NULL OR independent_varname = '') THEN
RAISE EXCEPTION 'Invalid independent variable name given.';
END IF;
IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
RAISE EXCEPTION 'Source table name does not exist.';
END IF;
IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
END IF;
-- initial validation
IF (input_group_cols IS NULL)
--OR array_upper(input_group_cols, 1) IS NULL
--OR array_upper(input_group_cols, 1) = 0)
THEN
PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
source_table, out_table,
dependent_varname, independent_varname,
heteroskedasticity_option);
ELSE
group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
-- create output table
EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
table_creation_string := 'CREATE TABLE ' || out_table || '(';
group_array_length = array_upper(group_cols, 1);
input_table_name = regexp_split_to_array(source_table, E'\\.');
IF array_upper(input_table_name, 1) = 1 THEN
actual_table_name = input_table_name[1];
schema_name := current_schema();
ELSIF array_upper(input_table_name, 1) = 2 THEN
actual_table_name = input_table_name[2];
schema_name = input_table_name[1];
ELSE
RAISE EXCEPTION 'Incorrect input source table name provided';
END IF;
-- Check that each grouping column exists
FOR each_group in 1 .. group_array_length
LOOP
if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
group_cols[each_group]) THEN
RAISE EXCEPTION 'Grouping column % does not exist',
group_cols[each_group];
END IF;
END LOOP;
FOR each_group in 1 .. group_array_length
LOOP
-- create a string that makes list of
EXECUTE 'SELECT data_type FROM information_schema.columns
WHERE
table_schema = ''' || schema_name || '''
AND table_name = ''' || actual_table_name || '''
AND column_name= ''' || group_cols[each_group] || ''''
INTO col_data_type;
table_creation_string := table_creation_string
|| group_cols[each_group]
|| ' ' || col_data_type || ',';
END LOOP;
-- finish creating the output table
EXECUTE table_creation_string || '
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
condition_no DOUBLE PRECISION)';
IF heteroskedasticity_option THEN
EXECUTE 'ALTER TABLE ' || out_table || '
ADD COLUMN bp_stats DOUBLE PRECISION,
ADD COLUMN bp_p_value DOUBLE PRECISION';
END IF;
group_string := '';
FOR each_group in 1 .. (group_array_length-1)
LOOP
group_string := group_string || group_cols[each_group] || ',';
END LOOP;
group_string := group_string || group_cols[group_array_length];
IF heteroskedasticity_option THEN
linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
EXECUTE '
DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
SELECT
'|| group_string ||',
(MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).*
FROM '|| source_table ||'
GROUP BY '|| group_string;
EXECUTE '
INSERT INTO ' || out_table || '
SELECT *
FROM
'|| linregr_fitting_rst ||'
JOIN (
SELECT
'|| group_string ||',
(MADLIB_SCHEMA.heteroskedasticity_test_linregr('
|| dependent_varname || ','
|| independent_varname || ', t.coef)).*
FROM
'|| source_table ||' AS s
JOIN
'|| linregr_fitting_rst ||' AS t
USING (' || group_string || ')
GROUP BY ' || group_string ||') z
USING ('|| group_string ||')';
EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
ELSE
EXECUTE '
INSERT INTO ' || out_table || '
SELECT ' || group_string || ', (result).coef, (result).r2,
(result).std_err, (result).t_stats,
(result).p_values, (result).condition_no
FROM (
SELECT ' || group_string ||
', MADLIB_SCHEMA.linregr( '
|| dependent_varname || ' , '
|| independent_varname || ' )
AS result
FROM ' || source_table || '
GROUP BY ' || group_string ||
') subq';
END IF;
END IF;
EXECUTE 'SET client_min_messages TO '|| old_msg_level;
END;
$$ LANGUAGE plpgsql VOLATILE;
---------------------------------------------------------------------------
/**
* @brief Linear regression training function with grouping support.
**/
CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, group_cols VARCHAR -- names of columns to group-by
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.linregr_train( source_table, out_table,
dependent_varname,
independent_varname, group_cols, FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
---------------------------------------------------------------------------