/* ----------------------------------------------------------------------- *//**
*
* @file linear.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa For a brief introduction to linear regression, see the module
* description \ref grp_linreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_linreg
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#train">Training Function</a></li>
<li class="level1"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
@brief Also called Ordinary Least Squares Regression, models a linear relationship between a dependent variable and one or more independent variables.
Linear regression models the relationship of a scalar dependent variable
\f$ y \f$ to one or more explanatory independent variables \f$ x \f$ and
estimates a vector of regression coefficients.
@anchor train
@par Training Function
The linear regression training function has the following syntax.
<pre class="syntax">
linregr_train( source_table,
out_table,
dependent_varname,
independent_varname,
grouping_cols,
heteroskedasticity_option
)
</pre>
\b Arguments
<DL class="arglist">
<DT>source_table</DT>
<DD>TEXT. The name of the table containing the training data.</DD>
<DT>out_table</DT>
<DD>TEXT. Name of the generated table containing the output model.
The output table contains the following columns.
<table class="output">
<tr>
<th>\<...></th>
<td>Any grouping columns provided during training.
Present only if the grouping option is used.</td>
</tr>
<tr>
<th>coef</th>
<td>FLOAT8[]. Vector of the coefficients of the regression.</td>
</tr>
<tr>
<th>r2</th>
<td>FLOAT8. R-squared coefficient of determination of the model.</td>
</tr>
<tr>
<th>std_err</th>
<td>FLOAT8[]. Vector of the standard error of the coefficients.</td>
</tr>
<tr>
<th>t_stats</th>
<td>FLOAT8[]. Vector of the t-statistics of the coefficients.</td>
</tr>
<tr>
<th>p_values</th>
<td>FLOAT8[]. Vector of the p-values of the coefficients.</td>
</tr>
<tr>
<th>condition_no</th>
<td>FLOAT8. The condition number of the \f$X^{*}X\f$
matrix. A high condition number usually indicates that the result may suffer
from numerical instability, yielding a less reliable model. A high
condition number often arises when there is a significant amount of
collinearity in the underlying design matrix, in which case other regression
techniques, such as elastic net regression, may be more appropriate.</td>
</tr>
<tr>
<th>bp_stats</th>
<td>FLOAT8. The Breusch-Pagan statistic of heteroskedasticity. Present
only if the heteroskedasticity_option argument was set to TRUE when the model
was trained.</td>
</tr>
<tr>
<th>bp_p_value</th>
<td>FLOAT8. The Breusch-Pagan p-value. Present only if
the heteroskedasticity_option argument was set to TRUE when the model was
trained.</td>
</tr>
<tr>
<th>num_rows_processed</th>
<td>INTEGER. The number of rows actually used in the computation for each group.</td>
</tr>
<tr>
<th>num_missing_rows_skipped</th>
<td>INTEGER. The number of rows that were skipped in the computation for each group because they contained NULL values in the dependent or independent variables.</td>
</tr></table>
A summary table named \<out_table\>_summary is created together with the output table. It has the following columns:
<table class="output">
<tr>
<th>source_table</th>
<td>The data source table name</td></tr>
<tr><th>out_table</th>
<td>The output table name</td></tr>
<tr><th>dependent_varname</th>
<td>The dependent variable</td></tr>
<tr><th>independent_varname</th>
<td>The independent variables</td></tr>
<tr><th>num_rows_processed</th>
<td>The total number of rows that were used in the computation.</td></tr>
<tr><th>num_missing_rows_skipped</th>
<td>The total number of rows that were skipped because of NULL values in them.</td>
</tr></table>
</dd>
<dt></dt>
<dd>@note For p-values, the computed result is returned directly.
Other statistical packages, such as R, compute the same values but apply an
additional formatting step when printing them: any p-value smaller than the
machine epsilon (the smallest positive floating-point number 'x' such that
'1 + x != 1') is displayed as "< xxx", where xxx is the value of the machine
epsilon. Although the displayed results may look different, the underlying
values are the same.
</dd>
<DT>dependent_varname</DT>
<DD>TEXT. Expression to evaluate for the dependent variable.</DD>
<DT>independent_varname</DT>
<DD>TEXT. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD>
<DT>grouping_cols (optional)</DT>
<DD>TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated.</DD>
<DT>heteroskedasticity_option (optional)</DT>
<DD>BOOLEAN, default: FALSE. When TRUE, the heteroskedasticity of the model is also calculated and returned with the results.</DD>
</DL>
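For instance, a training call that uses both optional arguments might look like
the following sketch (the table and column names here are placeholders, not
part of the module; the <tt>madlib</tt> schema prefix assumes a default
installation):
<pre class="example">
SELECT madlib.linregr_train( 'my_source_table',
                             'my_model_table',
                             'y',
                             'ARRAY[1, x1, x2]',
                             'region',
                             TRUE
                           );
</pre>
This fits one regression per distinct value of <tt>region</tt> and includes the
Breusch-Pagan statistics in each output row.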
@anchor warning
@warning The aggregate 'linregr' has been deprecated in favor of the function
'linregr_train'. If the aggregate 'linregr' is used to output the results of
linear regression to a table, it is recommended to follow the general pattern
shown below (replace text within '<...>' with the appropriate variable names).
<pre class="syntax">
CREATE TABLE \<output table\> AS
SELECT (r).*
FROM (
SELECT linregr(\<dependent variable\>, \<independent variable\>) as r
FROM \<source table\>
) q;
</pre>
@anchor predict
@par Prediction Function
The prediction function computes the dot product of the coefficient vector and
the array of independent variable values. It has the following syntax.
<pre class="syntax">
linregr_predict(coef, col_ind_var)
</pre>
\b Arguments
<dl class="arglist">
<dt>coef</dt>
<dd>FLOAT8[]. Vector of the coefficients of the regression.</dd>
<dt>col_ind_var</dt>
<dd>FLOAT8[]. Array containing the values of the independent variables.</dd>
</dl>
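A minimal standalone call (the values here are illustrative, not taken from the
example data set below) simply returns the dot product of the two arrays:
<pre class="example">
SELECT madlib.linregr_predict( ARRAY[2.0, 3.0]::FLOAT8[],
                               ARRAY[1.0, 10.0]::FLOAT8[]
                             );
</pre>
The result is 2.0*1.0 + 3.0*10.0 = 32.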
@anchor examples
@par Examples
-# Create an input data set.
<pre class="example">
CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
size INT, lot INT);
COPY houses FROM STDIN WITH DELIMITER '|';
1 | 590 | 2 | 1 | 50000 | 770 | 22100
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
3 | 20 | 3 | 1 | 22500 | 1060 | 3500
4 | 870 | 2 | 2 | 90000 | 1300 | 17500
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
6 | 1350 | 2 | 1 | 90500 | 820 | 25700
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
8 | 680 | 2 | 1 | 142500 | 1170 | 22000
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
\\.
</pre>
-# Train a regression model. First, a single regression for all the data.
<pre class="example">
SELECT madlib.linregr_train( 'houses',
'houses_linregr',
'price',
'ARRAY[1, tax, bath, size]'
);
</pre>
-# Generate three output models, one for each value of "bedroom".
<pre class="example">
SELECT madlib.linregr_train( 'houses',
'houses_linregr_bedroom',
'price',
'ARRAY[1, tax, bath, size]',
'bedroom'
);
</pre>
-# Examine the resulting models.
<pre class="example">
-- Set extended display on for easier reading of output
\\x ON
SELECT * FROM houses_linregr;
</pre>
Result:
<pre class="result">
-[ RECORD 1 ]+---------------------------------------------------------------------------
coef | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354}
r2 | 0.768577580597443
std_err | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087}
t_stats | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605}
p_values | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186}
condition_no | 9002.50457085737
</pre>
-# View the results grouped by bedroom.
<pre class="example">
SELECT * FROM houses_linregr_bedroom;
</pre>
Result:
<pre class="result">
-[ RECORD 1 ]+--------------------------------------------------------------------------
bedroom | 2
coef | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192}
r2 | 0.968809546465313
std_err | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784}
t_stats | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697}
p_values | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122}
condition_no | 10086.1048721726
-[ RECORD 2 ]+--------------------------------------------------------------------------
bedroom | 4
coef | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
r2 | 1
std_err | {0,0,0,0}
t_stats | {Infinity,Infinity,Infinity,Infinity}
p_values |
condition_no | Infinity
-[ RECORD 3 ]+--------------------------------------------------------------------------
bedroom | 3
coef | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324}
r2 | 0.841699901311252
std_err | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954}
t_stats | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067}
p_values | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789}
condition_no | 11722.6225642147
</pre>
Alternatively, you can unnest the result arrays for easier reading.
<pre class="example">
\\x OFF
SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,
unnest(coef) as coefficient,
unnest(std_err) as standard_error,
unnest(t_stats) as t_stat,
unnest(p_values) as pvalue
FROM houses_linregr;
</pre>
-# Use the prediction function to evaluate residuals.
<pre class="example">
SELECT houses.*,
madlib.linregr_predict( m.coef,
ARRAY[1,tax,bath,size]
) as predict,
price -
madlib.linregr_predict( m.coef,
ARRAY[1,tax,bath,size]
) as residual
FROM houses, houses_linregr m;
</pre>
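A follow-on aggregation (a sketch, not part of the module itself) can summarize
the fit, for example as the root-mean-square of the residuals:
<pre class="example">
SELECT sqrt(avg(residual * residual)) AS rmse
FROM (
    SELECT price - madlib.linregr_predict( m.coef,
                                           ARRAY[1,tax,bath,size]
                                         ) AS residual
    FROM houses, houses_linregr m
) q;
</pre>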
@anchor notes
@par Note
All table names can be optionally schema qualified (the schemas returned by
current_schemas() are searched if a schema name is not provided), and all table
and column names follow the case-sensitivity and quoting rules of the database.
(For instance, 'mytable' and 'MyTable' both resolve to the same entity,
'mytable'. If mixed-case or multi-byte characters are desired for entity names,
then the string should be double-quoted; in this case the input would be
'"MyTable"'.)
@anchor background
@par Technical Background
Ordinary least-squares (OLS) linear regression refers to a stochastic model in
which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
is an affine function of the vector of independent variables (usually denoted
\f$ \boldsymbol x \f$). That is,
\f[
E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is
that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)
probability density of \f$ Y \f$ is given by
\f[
f(y \mid \boldsymbol x)
= \frac{1}{\sqrt{2 \pi \sigma^2}}
\cdot \exp\left(-\frac{1}{2 \sigma^2}
\cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
\,.
\f]
OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
that maximizes the likelihood of the observations.
Let
- \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of the \f$ n \f$
observed values of the dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
columns and \f$ n \f$ rows, containing all observed vectors of independent
variables \f$ \boldsymbol x_i \f$ as rows,
- \f$ X^T \f$ denote the transpose of \f$ X \f$,
- \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
Maximizing the likelihood is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
squared residuals or sum of squared errors of prediction),
\f[
RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
= (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
\,.
\f]
The first-order conditions yield that the \f$ RSS \f$ is minimized at
\f[
\boldsymbol c = (X^T X)^+ X^T \boldsymbol y
\,.
\f]
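This follows from setting the gradient of the \f$ RSS \f$ with respect to
\f$ \boldsymbol c \f$ to zero:
\f[
    \nabla_{\boldsymbol c} \, RSS
    = -2 X^T (\boldsymbol y - X \boldsymbol c) = 0
    \quad\Longrightarrow\quad
    X^T X \boldsymbol c = X^T \boldsymbol y
    \,.
\f]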
Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
sum of squares</b> \f$ ESS \f$ (also called the regression sum of
squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
done according to the following formulas:
\f{align*}{
ESS & = \boldsymbol y^T X \boldsymbol c
- \frac{ \| y \|_1^2 }{n} \\
TSS & = \sum_{i=1}^n y_i^2
- \frac{ \| y \|_1^2 }{n} \\
R^2 & = \frac{ESS}{TSS}
\f}
Note: The last equality follows from the definition
\f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
\f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:
http://en.wikipedia.org/wiki/Sum_of_squares
We estimate the variance
\f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
\f[
\sigma^2 = \frac{RSS}{n - k}
\f]
and compute the t-statistic for coefficient \f$ i \f$ as
\f[
t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
\,.
\f]
The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a
value at least as extreme as the one observed, provided that the null hypothesis
(\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the
cumulative distribution function of the Student-t distribution with \f$ \nu \f$ degrees of freedom,
the \f$ p \f$-value for coefficient \f$ i \f$
is therefore
\f[
p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
\f]
where \f$ T \f$ is a Student-t distributed random variable with \f$ n - k \f$ degrees of freedom.
The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed
as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$
is the largest singular value of \f$X\f$, i.e., the square root of the largest
eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:
\f[
\|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,
\f]
where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.
The condition number of a linear regression problem
is a worst-case measure of how sensitive the
result is to small perturbations of the input. A large condition number (say,
more than 1000) indicates the presence of significant multicollinearity.
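As a small numerical illustration (not taken from the example data), consider
the nearly collinear design matrix
\f[
    X = \begin{pmatrix} 1 & 1 \\ 1 & 1.001 \end{pmatrix}.
\f]
Its singular values satisfy
\f$ \sigma_{\max}\sigma_{\min} = |\det X| = 0.001 \f$ and
\f$ \sigma_{\max}^2 + \sigma_{\min}^2 = \|X\|_F^2 \approx 4.002 \f$, giving
\f$ \sigma_{\max} \approx 2.0005 \f$ and \f$ \sigma_{\min} \approx 0.0005 \f$,
so that \f$ \kappa(X) = \sigma_{\max}/\sigma_{\min} \approx 4000 \f$: even this
mild near-collinearity already pushes the condition number well past the
rule-of-thumb threshold of 1000 mentioned above.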
@anchor literature
@literature
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
[2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.
[3] Wikipedia: Spectral Norm,
http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm
[4] Wikipedia: Breusch–Pagan test,
http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
[5] Wikipedia: Heteroscedasticity-consistent standard errors,
http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors
@anchor related
@par Related Topics
@ref grp_robust
@ref grp_clustered_errors
@ref grp_validation
File linear.sql_in, source file for the SQL functions
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
---------------------------------------------------------------------------
-- User facing functions
---------------------------------------------------------------------------
/**
* @brief Linear regression training function with grouping support.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_varname VARCHAR, -- name of dependent variable
independent_varname VARCHAR, -- name of independent variable
grouping_cols VARCHAR, -- names of columns to group-by
heteroskedasticity_option BOOLEAN -- do heteroskedasticity test or not
) RETURNS VOID AS $$
PythonFunction(regress, linear, linregr_train)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_varname VARCHAR, -- name of dependent variable
independent_varname VARCHAR, -- name of independent variable
grouping_cols VARCHAR -- names of columns to group-by
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, $5, FALSE);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_varname VARCHAR, -- name of dependent variable
independent_varname VARCHAR -- name of independent variable
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, NULL, FALSE);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
---------------------------------------------------------------------------
-- Online help function
---------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.linregr_train('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
message VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
PythonFunction(regress, linear, linregr_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
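-- Usage sketch (assuming the module is installed in a schema named "madlib"):
--   SELECT madlib.linregr_train();
-- returns the usage/help message produced by the Python help function above.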
----------------------------------------------------------------------
-- Deprecated functions --------------------------------------------------------
---------------------------------------------------------------------------
-- Result Types
---------------------------------------------------------------------------
DROP TYPE IF EXISTS MADLIB_SCHEMA.linregr_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
coef DOUBLE PRECISION[],
r2 DOUBLE PRECISION,
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[],
condition_no DOUBLE PRECISION,
num_processed BIGINT,
vcov DOUBLE PRECISION[]
);
DROP TYPE IF EXISTS MADLIB_SCHEMA.heteroskedasticity_test_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
bp_stats DOUBLE PRECISION,
bp_p_value DOUBLE PRECISION
);
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Functions for user-defined aggregates
---------------------------------------------------------------------------
-- Transition, merge and final functions for the linregr aggregate
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.linregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Transition and merge functions for the heteroskedasticity test aggregate
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Final function for the heteroskedasticity test aggregate
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- User-defined aggregates
---------------------------------------------------------------------------
/**
* @brief Compute linear regression coefficients and diagnostic statistics.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
* - <tt>condition_no FLOAT8</tt> - The condition number of matrix
* \f$ X^T X \f$.
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>)).*
*FROM <em>sourceName</em>;</pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT (linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>)).coef
*FROM <em>sourceName</em>;</pre>
* - Get a subset of the output columns, e.g., only the array of coefficients
* \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
* the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
*FROM (
* SELECT linregr( <em>dependentVariable</em>,
* <em>independentVariables</em>) AS lr
* FROM <em>sourceName</em>
*) AS subq;</pre>
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linregr(DOUBLE PRECISION, DOUBLE PRECISION []);
CREATE AGGREGATE MADLIB_SCHEMA.linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.linregr_transition,
STYPE=MADLIB_SCHEMA.bytea8,
FINALFUNC=MADLIB_SCHEMA.linregr_final,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linregr_merge_states,')
INITCOND=''
);
/**
* @brief Compute the studentized Breusch-Pagan heteroskedasticity test for
* linear regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>bp_stats FLOAT8</tt> - The Breusch-Pagan test statistic
* - <tt>bp_p_value FLOAT8</tt> - The p-value of the test (Prob > test statistic)
*
* @usage
* <pre> SELECT (heteroskedasticity_test_linregr(<em>dependentVariable</em>,
*                                               <em>independentVariables</em>,
*                                               ols.coef)).*
* FROM (
*     SELECT (linregr(<em>dependentVariable</em>,
*                     <em>independentVariables</em>)).coef
*     FROM <em>sourceName</em>
* ) AS ols, <em>sourceName</em> AS src;
* </pre>
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.heteroskedasticity_test_linregr(
DOUBLE PRECISION, DOUBLE PRECISION [], DOUBLE PRECISION []);
CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
STYPE=MADLIB_SCHEMA.bytea8,
FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')
INITCOND=''
);
---------------------------------------------------------------------------
/**
* @brief Predict the value of the dependent variable for a given array of
* independent variable values in a linear regression model
*
* @param coef Coefficients obtained by running linear regression.
* @param col_ind_var Array of independent variable values
* @returns DOUBLE PRECISION Predicted value
*
* This function computes the dot product of the independent variables and the
* coefficients. This requires the length of the two vectors to be the same.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict(
coef DOUBLE PRECISION[],
col_ind_var DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION
AS 'MODULE_PATHNAME', 'array_dot'
LANGUAGE c IMMUTABLE STRICT;
-- Help messages -------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(regress, linear, linregr_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict()
RETURNS TEXT
AS $$
SELECT MADLIB_SCHEMA.linregr_predict(''::TEXT);
$$
LANGUAGE sql IMMUTABLE;
-------------------------------------------------------------