blob: 577d2e09b7152b919c9affc7d6f00b6a2d217c42 [file] [log] [blame]
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_clustered_errors
@about
Adjusting standard errors for clustering can be important. For
example, replicating a dataset 100 times should not increase the
precision of parameter estimates. However, performing this procedure
with the IID assumption will actually do this. Another example is in
economics of education research, it is reasonable to expect that the
error terms for children in the same class are not
independent. Clustering standard errors can correct for this.
Assume that the data can be separated into \f$m\f$ clusters. Usually this
can be done by grouping the data table according to one or multiple
columns.
The estimator has a similar form to the usual sandwich estimator
\f[
S(\vec{c}) = B(\vec{c}) M(\vec{c}) B(\vec{c})
\f]
The bread part is the same as Huber-White sandwich estimator
\f{eqnarray}{
B(\vec{c}) & = & \left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i,
\vec{c})\right)^{-1}\\
& = & \left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i,
\vec{c})}{\partial c_\alpha \partial c_\beta}\right)^{-1}
\f}
where \f$H\f$ is the hessian matrix, which is the second derivative of the
target function
\f[
L(\vec{c}) = \sum_{i=1}^n l(y_i, \vec{x}_i, \vec{c})\ .
\f]
The meat part is different
\f[
M(\vec{c}) = \bf{A}^T\bf{A}
\f]
where the \f$m\f$-th row of \f$\bf{A}\f$ is
\f[
A_m = \sum_{i\in G_m}\frac{\partial
l(y_i,\vec{x}_i,\vec{c})}{\partial \vec{c}}
\f]
where \f$G_m\f$ is the set of rows that belong to the same cluster.
We can compute the quantities of \f$B\f$ and \f$A\f$ for each cluster during one scan through
the data table in an aggregate function. Then sum over all clusters to
the full \f$B\f$ and \f$A\f$ in the outside of the aggregate function. At last, the matrix mulplitications
are
done in a separate function on the master node.
When multinomial logistic regression is computed before the multinomial
clustered variance calculation, it uses a default reference category of zero and the regression coefficients are included in the output table. The
regression coefficients in the output are in the same order as multinomial
logistic regression function, which is described below.
For a problem with
\f$ K \f$ dependent variables \f$ (1, ..., K) \f$ and \f$ J \f$ categories \f$ (0, ..., J-1)
\f$, let \f$ {m_{k,j}} \f$ denote the coefficient for dependent variable \f$ k
\f$ and category \f$ j \f$. The output is \f$ {m_{k_1, j_0}, m_{k_1, j_1}
\ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1} \ldots m_{k_K, j_{J-1}}} \f$.
The order is NOT CONSISTENT with the multinomial regression marginal effect
calculation with function <em>marginal_mlogregr</em>. This is deliberate
because the interfaces of all multinomial regressions (robust, clustered, ...)
will be moved to match that used in marginal.
@usage
\warning The \b 'groupingCol' input parameter for all clustered functions are a
placeholder, and the \b 'verbose' parameter is a placeholder for \e clustered_variance_mlogregr. These inputs will be implemented in a future release.
<b> The clustered standard errors for linear regression </b>
For a quick help message, run the following commands for linear regression
<pre>
select madlib.clustered_variance_linregr();
select madlib.clustered_variance_linregr('help');
select madlib.clustered_variance_linregr('?');
select madlib.clustered_variance_linregr('usage');
</pre>
For logistic regression, run the following commands to get short help messages inside psql
<pre>
select madlib.clustered_variance_logregr();
select madlib.clustered_variance_logregr('help');
select madlib.clustered_variance_logregr('?');
select madlib.clustered_variance_logregr('usage');
</pre>
For multinomial logistic regression, run the following commands to get short help messages inside psql
<pre>
select madlib.clustered_variance_mlogregr();
select madlib.clustered_variance_mlogregr('help');
select madlib.clustered_variance_mlogregr('?');
select madlib.clustered_variance_mlogregr('usage');
</pre>
<pre>
SELECT madlib.clustered_variance_linregr (
<em>'tbl_data'</em>, -- Data table name
<em>'tbl_output'</em>, -- The result table
<em>'depvar'</em>, -- An expression used as dependent variable
<em>'indvar'</em>, -- An expression used as independent variable
<em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
<em>'groupingvar'</em> -- The columns used as the grouping variables, separated by comma
);
</pre>
<pre>
SELECT madlib.clustered_variance_logregr (
<em>'tbl_data'</em>, -- Data table name
<em>'tbl_output'</em>, -- The result table
<em>'depvar'</em>, -- An expression used as dependent variable
<em>'indvar'</em>, -- An expression used as independent variable
<em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
<em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma
<em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20
<em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls'
<em>tolerance</em>, -- When difference of likelihoods in two consecutive iterations smaller than
-- this value, stops the computation. Default 0.0001
<em>verbose</em> -- Whether print detailed information when computing logistic regression,
-- default is False
);
</pre>
<pre>
SELECT madlib.clustered_variance_mlogregr (
<em>'tbl_data'</em>, -- Data table name
<em>'tbl_output'</em>, -- The result table
<em>'depvar'</em>, -- An expression used as dependent variable
<em>'indvar'</em>, -- An expression used as independent variable
<em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
<em>ref_category</em>, -- Reference category in the range of [0, num_category)
<em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma
<em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20
<em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls'
<em>tolerance</em>, -- When difference of likelihoods in two consecutive iterations smaller than
-- this value, stops the computation. Default 0.0001
<em>verbose</em> -- Whether print detailed information when computing logistic regression,
-- default is False
);
</pre>
@examp
Note that we need to manually include an intercept term in the independent variable expression. The NULL value of <em>groupingvar</em> means that there is no grouping in the calculation.
@verbatim
sql> drop table if exists tbl_output;
sql> select madlib.clustered_variance_linregr ('abalone', 'tbl_output', 'rings', 'array[1, diameter, length, width]', 'sex', NULL);
sql> select * from tbl_output;
sql> ----------------------------------------------
sql> drop table if exists tbl_output;
sql> select madlib.clustered_variance_logregr ('abalone', 'tbl_output', 'rings < 10', 'array[1, diameter, length, width]', 'sex');
sql> select * from tbl_output;
sql> ----------------------------------------------
sql> drop table if exists tbl_output;
sql> select madlib.clustered_variance_mlogregr ('abalone', 'tbl_output', 'case when rings < 10 then 1 else 0 end', 'array[1, diameter, length, width]', 'sex', 0);
sql> select * from tbl_output;
@endverbatim
@literature
[1] Standard, Robust, and Clustered Standard Errors Computed in R, http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/
@sa File clustered_variance.sql_in documenting the SQL function
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
------------------------------------------------------------------------
/**
* @brief Compute the clustered errors
*
* @param tbl_data Data table name
* @param tbl_output The result table
* @param depvar Dependent variable expression
* @param indvar Independent variable expression
* @param clustervar The expressions used to clustering
* @param grouping_col The names of grouping columns
* @param coef Optional, the fitting coefficients
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT
) RETURNS VOID AS $$
PythonFunction(regress, clustered_variance, clustered_variance_linregr)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_linregr(tbl_data, tbl_output, depvar, indvar, clustervar, NULL);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_linregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
msg TEXT
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_linregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- Aggregate function --
CREATE TYPE MADLIB_SCHEMA.__clustered_agg_result AS (
meatvec DOUBLE PRECISION[],
breadvec DOUBLE PRECISION[]
);
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_transition (
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[]
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_merge (
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_final (
state MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_lin_step (
/* depvar */ DOUBLE PRECISION,
/* indvar */ DOUBLE PRECISION[],
/* coef */ DOUBLE PRECISION[]) (
m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_lin_merge,')
SFUNC = MADLIB_SCHEMA.__clustered_err_lin_transition,
STYPE = MADLIB_SCHEMA.bytea8,
FINALFUNC = MADLIB_SCHEMA.__clustered_err_lin_final,
INITCOND = ''
);
------------------------------------------------------------------------
CREATE TYPE MADLIB_SCHEMA.__clustered_lin_result AS (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
-- Compute the t-stats and p-values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_lin_compute_stats (
coef DOUBLE PRECISION[],
meatvec DOUBLE PRECISION[],
breadvec DOUBLE PRECISION[],
mcluster INTEGER,
numRows INTEGER
) RETURNS MADLIB_SCHEMA.__clustered_lin_result AS
'MODULE_PATHNAME', 'clustered_lin_compute_stats'
LANGUAGE C IMMUTABLE STRICT;
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Supporting PostgreSQL sum over array
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_add (
x DOUBLE PRECISION[],
y DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION[] AS $$
DECLARE
a int;
b int;
i int;
res double precision[];
BEGIN
res = x;
a := array_lower (y, 1);
b := array_upper (y, 1);
IF a IS NOT NULL THEN
FOR i IN a .. b LOOP
res[i] := coalesce(res[i],0) + y[i];
END LOOP;
END IF;
RETURN res;
END;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE AGGREGATE MADLIB_SCHEMA.__array_sum (
/* x */ DOUBLE PRECISION[]
) (
SFunc = MADLIB_SCHEMA.__array_add,
SType = DOUBLE PRECISION[],
m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__array_add,')
InitCond = '{}'
);
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Logistic clustered standard errors
------------------------------------------------------------------------
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT,
tolerance DOUBLE PRECISION,
verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunction(regress, clustered_variance, clustered_variance_logregr)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
NULL, 20, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
grouping_col, 20, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT,
max_iter INTEGER
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
grouping_col, max_iter, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
grouping_col, max_iter, optimizer, 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT,
tolerance DOUBLE PRECISION
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
grouping_col, max_iter, optimizer, tolerance, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_logregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
msg TEXT
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_logregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- Aggregate function --
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_transition (
state MADLIB_SCHEMA.bytea8,
y BOOLEAN,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[]
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_merge (
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_final (
state MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_log_step (
/* depvar */ BOOLEAN,
/* indvar */ DOUBLE PRECISION[],
/* coef */ DOUBLE PRECISION[]) (
m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_log_merge,')
SFUNC = MADLIB_SCHEMA.__clustered_err_log_transition,
STYPE = MADLIB_SCHEMA.bytea8,
FINALFUNC = MADLIB_SCHEMA.__clustered_err_log_final,
INITCOND = ''
);
------------------------------------------------------------------------
CREATE TYPE MADLIB_SCHEMA.__clustered_log_result AS (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
-- Compute the t-stats and p-values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_log_compute_stats (
coef DOUBLE PRECISION[],
meatvec DOUBLE PRECISION[],
breadvec DOUBLE PRECISION[],
mcluster INTEGER,
numRows INTEGER
) RETURNS MADLIB_SCHEMA.__clustered_log_result AS
'MODULE_PATHNAME', 'clustered_log_compute_stats'
LANGUAGE C IMMUTABLE STRICT;
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Multi-Logistic clustered standard errors
------------------------------------------------------------------------
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT,
tolerance DOUBLE PRECISION,
verbose BOOLEAN
) RETURNS VOID AS $$
PythonFunction(regress, clustered_variance, clustered_variance_mlogregr)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, 0,
NULL, 20, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
NULL, 20, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER,
grouping_col TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
grouping_col, 20, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER,
grouping_col TEXT,
max_iter INTEGER
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
grouping_col, max_iter, 'irls', 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
grouping_col, max_iter, optimizer, 0.0001, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
tbl_data TEXT,
tbl_output TEXT,
depvar TEXT,
indvar TEXT,
clustervar TEXT,
ref_category INTEGER,
grouping_col TEXT,
max_iter INTEGER,
optimizer TEXT,
tolerance DOUBLE PRECISION
) RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
grouping_col, max_iter, optimizer, tolerance, False);
END;
$$ LANGUAGE plpgsql;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- print quick help messages
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
msg TEXT
) RETURNS TEXT AS $$
PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help)
$$ LANGUAGE plpythonu;
------------------------------------------------------------------------
-- Aggregate function --
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_transition (
state MADLIB_SCHEMA.bytea8,
y INTEGER,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[],
num_cat INTEGER,
ref_cat INTEGER
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_merge (
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_final (
state MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
------------------------
CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_mlog_step (
/* depvar */ INTEGER,
/* indvar */ DOUBLE PRECISION[],
/* coef */ DOUBLE PRECISION[],
/* num_cat */ INTEGER,
/* reference_cat */ INTEGER) (
m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_mlog_merge,')
SFUNC = MADLIB_SCHEMA.__clustered_err_mlog_transition,
STYPE = MADLIB_SCHEMA.bytea8,
FINALFUNC = MADLIB_SCHEMA.__clustered_err_mlog_final,
INITCOND = ''
);
------------------------------------------------------------------------
CREATE TYPE MADLIB_SCHEMA.__clustered_mlog_result AS (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
-- Compute the t-stats and p-values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_mlog_compute_stats (
coef DOUBLE PRECISION[],
meatvec DOUBLE PRECISION[],
breadvec DOUBLE PRECISION[],
mcluster INTEGER,
numRows INTEGER
) RETURNS MADLIB_SCHEMA.__clustered_mlog_result AS
'MODULE_PATHNAME', 'clustered_mlog_compute_stats'
LANGUAGE C IMMUTABLE STRICT;