| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_clustered_errors |
| |
| @about |
| |
| Adjusting standard errors for clustering can be important. For |
| example, replicating a dataset 100 times should not increase the |
| precision of parameter estimates. However, performing this procedure |
| with the IID assumption will actually do this. Another example is in |
| economics of education research, it is reasonable to expect that the |
| error terms for children in the same class are not |
| independent. Clustering standard errors can correct for this. |
| |
| Assume that the data can be separated into \f$m\f$ clusters. Usually this |
| can be done by grouping the data table according to one or multiple |
| columns. |
| |
| The estimator has a similar form to the usual sandwich estimator |
| \f[ |
| S(\vec{c}) = B(\vec{c}) M(\vec{c}) B(\vec{c}) |
| \f] |
| |
| The bread part is the same as Huber-White sandwich estimator |
| \f{eqnarray}{ |
| B(\vec{c}) & = & \left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i, |
| \vec{c})\right)^{-1}\\ |
| & = & \left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i, |
| \vec{c})}{\partial c_\alpha \partial c_\beta}\right)^{-1} |
| \f} |
| where \f$H\f$ is the hessian matrix, which is the second derivative of the |
| target function |
| \f[ |
| L(\vec{c}) = \sum_{i=1}^n l(y_i, \vec{x}_i, \vec{c})\ . |
| \f] |
| |
| The meat part is different |
| \f[ |
| M(\vec{c}) = \bf{A}^T\bf{A} |
| \f] |
| where the \f$m\f$-th row of \f$\bf{A}\f$ is |
| \f[ |
| A_m = \sum_{i\in G_m}\frac{\partial |
| l(y_i,\vec{x}_i,\vec{c})}{\partial \vec{c}} |
| \f] |
| where \f$G_m\f$ is the set of rows that belong to the same cluster. |
| |
| We can compute the quantities of \f$B\f$ and \f$A\f$ for each cluster during one scan through |
| the data table in an aggregate function. Then sum over all clusters to |
| the full \f$B\f$ and \f$A\f$ in the outside of the aggregate function. At last, the matrix mulplitications |
| are |
| done in a separate function on the master node. |
| |
| When multinomial logistic regression is computed before the multinomial |
| clustered variance calculation, it uses a default reference category of zero and the regression coefficients are included in the output table. The |
| regression coefficients in the output are in the same order as multinomial |
| logistic regression function, which is described below. |
| For a problem with |
| \f$ K \f$ dependent variables \f$ (1, ..., K) \f$ and \f$ J \f$ categories \f$ (0, ..., J-1) |
| \f$, let \f$ {m_{k,j}} \f$ denote the coefficient for dependent variable \f$ k |
| \f$ and category \f$ j \f$. The output is \f$ {m_{k_1, j_0}, m_{k_1, j_1} |
| \ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1} \ldots m_{k_K, j_{J-1}}} \f$. |
| The order is NOT CONSISTENT with the multinomial regression marginal effect |
| calculation with function <em>marginal_mlogregr</em>. This is deliberate |
| because the interfaces of all multinomial regressions (robust, clustered, ...) |
| will be moved to match that used in marginal. |
| |
| @usage |
| |
| \warning The \b 'groupingCol' input parameter for all clustered functions are a |
| placeholder, and the \b 'verbose' parameter is a placeholder for \e clustered_variance_mlogregr. These inputs will be implemented in a future release. |
| |
| <b> The clustered standard errors for linear regression </b> |
| |
| For a quick help message, run the following commands for linear regression |
| <pre> |
| select madlib.clustered_variance_linregr(); |
| select madlib.clustered_variance_linregr('help'); |
| select madlib.clustered_variance_linregr('?'); |
| select madlib.clustered_variance_linregr('usage'); |
| </pre> |
| |
| For logistic regression, run the following commands to get short help messages inside psql |
| <pre> |
| select madlib.clustered_variance_logregr(); |
| select madlib.clustered_variance_logregr('help'); |
| select madlib.clustered_variance_logregr('?'); |
| select madlib.clustered_variance_logregr('usage'); |
| </pre> |
| |
| For multinomial logistic regression, run the following commands to get short help messages inside psql |
| <pre> |
| select madlib.clustered_variance_mlogregr(); |
| select madlib.clustered_variance_mlogregr('help'); |
| select madlib.clustered_variance_mlogregr('?'); |
| select madlib.clustered_variance_mlogregr('usage'); |
| </pre> |
| |
| <pre> |
| SELECT madlib.clustered_variance_linregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>'groupingvar'</em> -- The columns used as the grouping variables, separated by comma |
| ); |
| </pre> |
| |
| <pre> |
| SELECT madlib.clustered_variance_logregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma |
| <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20 |
| <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls' |
| <em>tolerance</em>, -- When difference of likelihoods in two consecutive iterations smaller than |
| -- this value, stops the computation. Default 0.0001 |
| <em>verbose</em> -- Whether print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| </pre> |
| |
| <pre> |
| SELECT madlib.clustered_variance_mlogregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>ref_category</em>, -- Reference category in the range of [0, num_category) |
| <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma |
| <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20 |
| <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls' |
| <em>tolerance</em>, -- When difference of likelihoods in two consecutive iterations smaller than |
| -- this value, stops the computation. Default 0.0001 |
| <em>verbose</em> -- Whether print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| </pre> |
| |
| @examp |
| |
| Note that we need to manually include an intercept term in the independent variable expression. The NULL value of <em>groupingvar</em> means that there is no grouping in the calculation. |
| |
| @verbatim |
| sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_linregr ('abalone', 'tbl_output', 'rings', 'array[1, diameter, length, width]', 'sex', NULL); |
| sql> select * from tbl_output; |
| sql> ---------------------------------------------- |
| sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_logregr ('abalone', 'tbl_output', 'rings < 10', 'array[1, diameter, length, width]', 'sex'); |
| sql> select * from tbl_output; |
| sql> ---------------------------------------------- |
| sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_mlogregr ('abalone', 'tbl_output', 'case when rings < 10 then 1 else 0 end', 'array[1, diameter, length, width]', 'sex', 0); |
| sql> select * from tbl_output; |
| @endverbatim |
| |
| @literature |
| |
| [1] Standard, Robust, and Clustered Standard Errors Computed in R, http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/ |
| |
| @sa File clustered_variance.sql_in documenting the SQL function |
| |
| @internal |
| @sa Namespace \ref madlib::modules::regress |
| documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| ------------------------------------------------------------------------ |
| |
| /** |
| * @brief Compute the clustered errors |
| * |
| * @param tbl_data Data table name |
| * @param tbl_output The result table |
| * @param depvar Dependent variable expression |
| * @param indvar Independent variable expression |
| * @param clustervar The expressions used to clustering |
| * @param grouping_col The names of grouping columns |
| * @param coef Optional, the fitting coefficients |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_linregr) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_linregr(tbl_data, tbl_output, depvar, indvar, clustervar, NULL); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr ( |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_linregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr ( |
| msg TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_linregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| -- Aggregate function -- |
| |
| CREATE TYPE MADLIB_SCHEMA.__clustered_agg_result AS ( |
| meatvec DOUBLE PRECISION[], |
| breadvec DOUBLE PRECISION[] |
| ); |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_transition ( |
| state MADLIB_SCHEMA.bytea8, |
| y DOUBLE PRECISION, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[] |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_merge ( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_final ( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_lin_step ( |
| /* depvar */ DOUBLE PRECISION, |
| /* indvar */ DOUBLE PRECISION[], |
| /* coef */ DOUBLE PRECISION[]) ( |
| m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_lin_merge,') |
| SFUNC = MADLIB_SCHEMA.__clustered_err_lin_transition, |
| STYPE = MADLIB_SCHEMA.bytea8, |
| FINALFUNC = MADLIB_SCHEMA.__clustered_err_lin_final, |
| INITCOND = '' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE TYPE MADLIB_SCHEMA.__clustered_lin_result AS ( |
| coef DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| t_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| -- Compute the t-stats and p-values |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_lin_compute_stats ( |
| coef DOUBLE PRECISION[], |
| meatvec DOUBLE PRECISION[], |
| breadvec DOUBLE PRECISION[], |
| mcluster INTEGER, |
| numRows INTEGER |
| ) RETURNS MADLIB_SCHEMA.__clustered_lin_result AS |
| 'MODULE_PATHNAME', 'clustered_lin_compute_stats' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| |
| -- Supporting PostgreSQL sum over array |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_add ( |
| x DOUBLE PRECISION[], |
| y DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION[] AS $$ |
| DECLARE |
| a int; |
| b int; |
| i int; |
| res double precision[]; |
| BEGIN |
| res = x; |
| |
| a := array_lower (y, 1); |
| b := array_upper (y, 1); |
| |
| IF a IS NOT NULL THEN |
| FOR i IN a .. b LOOP |
| res[i] := coalesce(res[i],0) + y[i]; |
| END LOOP; |
| END IF; |
| |
| RETURN res; |
| END; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.__array_sum ( |
| /* x */ DOUBLE PRECISION[] |
| ) ( |
| SFunc = MADLIB_SCHEMA.__array_add, |
| SType = DOUBLE PRECISION[], |
| m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__array_add,') |
| InitCond = '{}' |
| ); |
| |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| -- Logistic clustered standard errors |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT, |
| tolerance DOUBLE PRECISION, |
| verbose BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_logregr) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar, |
| NULL, 20, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar, |
| grouping_col, 20, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT, |
| max_iter INTEGER |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar, |
| grouping_col, max_iter, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar, |
| grouping_col, max_iter, optimizer, 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT, |
| tolerance DOUBLE PRECISION |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar, |
| grouping_col, max_iter, optimizer, tolerance, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_logregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr ( |
| msg TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_logregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| -- Aggregate function -- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_transition ( |
| state MADLIB_SCHEMA.bytea8, |
| y BOOLEAN, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[] |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_merge ( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_final ( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_log_step ( |
| /* depvar */ BOOLEAN, |
| /* indvar */ DOUBLE PRECISION[], |
| /* coef */ DOUBLE PRECISION[]) ( |
| m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_log_merge,') |
| SFUNC = MADLIB_SCHEMA.__clustered_err_log_transition, |
| STYPE = MADLIB_SCHEMA.bytea8, |
| FINALFUNC = MADLIB_SCHEMA.__clustered_err_log_final, |
| INITCOND = '' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE TYPE MADLIB_SCHEMA.__clustered_log_result AS ( |
| coef DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| z_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| -- Compute the t-stats and p-values |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_log_compute_stats ( |
| coef DOUBLE PRECISION[], |
| meatvec DOUBLE PRECISION[], |
| breadvec DOUBLE PRECISION[], |
| mcluster INTEGER, |
| numRows INTEGER |
| ) RETURNS MADLIB_SCHEMA.__clustered_log_result AS |
| 'MODULE_PATHNAME', 'clustered_log_compute_stats' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| -- Multi-Logistic clustered standard errors |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT, |
| tolerance DOUBLE PRECISION, |
| verbose BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_mlogregr) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, 0, |
| NULL, 20, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category, |
| NULL, 20, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER, |
| grouping_col TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category, |
| grouping_col, 20, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER, |
| grouping_col TEXT, |
| max_iter INTEGER |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category, |
| grouping_col, max_iter, 'irls', 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category, |
| grouping_col, max_iter, optimizer, 0.0001, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| tbl_data TEXT, |
| tbl_output TEXT, |
| depvar TEXT, |
| indvar TEXT, |
| clustervar TEXT, |
| ref_category INTEGER, |
| grouping_col TEXT, |
| max_iter INTEGER, |
| optimizer TEXT, |
| tolerance DOUBLE PRECISION |
| ) RETURNS VOID AS $$ |
| BEGIN |
| PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category, |
| grouping_col, max_iter, optimizer, tolerance, False); |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| |
| -- print quick help messages |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr ( |
| msg TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------ |
| -- Aggregate function -- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_transition ( |
| state MADLIB_SCHEMA.bytea8, |
| y INTEGER, |
| x DOUBLE PRECISION[], |
| coef DOUBLE PRECISION[], |
| num_cat INTEGER, |
| ref_cat INTEGER |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_merge ( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_final ( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS |
| 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_mlog_step ( |
| /* depvar */ INTEGER, |
| /* indvar */ DOUBLE PRECISION[], |
| /* coef */ DOUBLE PRECISION[], |
| /* num_cat */ INTEGER, |
| /* reference_cat */ INTEGER) ( |
| m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_mlog_merge,') |
| SFUNC = MADLIB_SCHEMA.__clustered_err_mlog_transition, |
| STYPE = MADLIB_SCHEMA.bytea8, |
| FINALFUNC = MADLIB_SCHEMA.__clustered_err_mlog_final, |
| INITCOND = '' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE TYPE MADLIB_SCHEMA.__clustered_mlog_result AS ( |
| coef DOUBLE PRECISION[], |
| std_err DOUBLE PRECISION[], |
| z_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[] |
| ); |
| |
| -- Compute the t-stats and p-values |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_mlog_compute_stats ( |
| coef DOUBLE PRECISION[], |
| meatvec DOUBLE PRECISION[], |
| breadvec DOUBLE PRECISION[], |
| mcluster INTEGER, |
| numRows INTEGER |
| ) RETURNS MADLIB_SCHEMA.__clustered_mlog_result AS |
| 'MODULE_PATHNAME', 'clustered_mlog_compute_stats' |
| LANGUAGE C IMMUTABLE STRICT; |