blob: 6060d491eea1064e5c01d37d859c88095e4fe35f [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file robust.sql_in
*
* @brief SQL functions for linear regression
* @date January 2011
*
* @sa Calculates robust statistics for various regression models.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_robust
@about
When doing regression analysis, we are sometimes interested in the variance of the computed coefficients \f$ \boldsymbol c \f$. While the built-in regression functions provide variance estimates, we may prefer a <i> robust </i> variance estimate.
The robust variance calculation can be expressed in a sandwich formation, which is the form
\f[
S( \boldsymbol c) = B( \boldsymbol c) M( \boldsymbol c) B( \boldsymbol c)
\f]
where \f$ B( \boldsymbol c)\f$ and \f$ M( \boldsymbol c)\f$ are matrices. The \f$ B( \boldsymbol c) \f$ matrix,
also known as the bread, is relatively straight forward, and can be computed as
\f[
B( \boldsymbol c) = n\left(\sum_i^n -H(y_i, x_i, \boldsymbol c) \right)^{-1}
\f]
where \f$ H \f$ is the hessian matrix.
The \f$ M( \boldsymbol c)\f$ matrix has several variation, each with different robustness properties.
The form implemented here is the Huber-White sandwich operator, which takes the form
\f[
M_{H} =\frac{1}{n} \sum_i^n \psi(y_i,x_i, \boldsymbol c)^T \psi(y_i,x_i, \boldsymbol c).
\f]
The above method for calculating robust variance (Huber-White estimates) is implemented for linear regression, logistic regression, and multinomial logistic regression. It is useful in calculating variances in a dataset with potentially noisy outliers. The Huber-White implemented here is identical to the "HC0" sandwich operator in the R module "sandwich".
The interface for robust linear, logistic, and multinomial logistic regression are similar, differing only in the optimal parameters. Calling the help and usage functions is identical across all three robust regressions.
When multinomial logistic regression is computed before the multinomial robust
regression, it uses a default reference category of zero and the regression coefficients are included in the output table. The regression
coefficients in the output are in the same order as multinomial logistic
regression function, which is described below.
For a problem with
\f$ K \f$ dependent variables \f$ (1, ..., K) \f$ and \f$ J \f$ categories \f$ (0, ..., J-1)
\f$, let \f$ {m_{k,j}} \f$ denote the coefficient for dependent variable \f$ k
\f$ and category \f$ j \f$ . The output is \f$ {m_{k_1, j_0}, m_{k_1, j_1}
\ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1} \ldots m_{k_K, j_{J-1}}} \f$.
The order is NOT CONSISTENT with the multinomial regression marginal effect
calculation with function <em>marginal_mlogregr</em>. This is deliberate
because the interfaces of all multinomial regressions (robust, clustered, ...)
will be moved to match that used in marginal.
@input
The training data is expected to be of the following form:
<pre>{TABLE|VIEW} <em>sourceName</em> (
<em>outputTable</em> VARCHAR,
<em>regressionType </em> VARCHAR,
<em>dependentVariable</em> VARCHAR,
<em>independentVariable</em> VARCHAR
)</pre>
@usage
<b> The Full Interface</b>
\warning The \b 'groupingCol' and \b 'print_warnings' input parameter for \e robust_variance_mlogregr
is a placeholder in the Madlib V1.0. These input parameters will be implemented in a
future release.
<pre>
SELECT \ref madlib.robust_variance_linregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variables, VARCHAR
<em>'grouping_cols'</em> -- [OPTIONAL] grouping variables, VARCHAR
);
</pre>
OR
<pre>
SELECT \ref madlib.robust_variance_logregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variables, VARCHAR
<em>'grouping_cols'</em>, -- [OPTIONAL] grouping variables, VARCHAR
<em>max_iter</em>, -- [OPTIONAL] Integer identifying the maximum iterations used by the logistic regression solver. Default is 20, Integer
<em>'optimizer'</em>, -- [OPTIONAL] String identifying the optimizer used in the logistic regression. See the documentation in the logistic regression for the available options. Default is irls. VARCHAR
<em>tolerance</em>, -- [OPTIONAL] Float identifying the tolerance of the logistic regression optimizer. Default is 0.0001. DOUBLE PRECISION
<em>print_warnings</em> -- [OPTIONAL] Boolean specifying if the regression fit should print any warning messages. Default is false. BOOLEAN
);
</pre>
OR
<pre>
SELECT \ref madlib.robust_variance_mlogregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variables, VARCHAR
<em>ref_category</em>, -- [OPTIONAL] Integer specifying the reference category. Default is 0.
<em>'grouping_cols'</em>, -- [OPTIONAL] grouping variables, VARCHAR. Default is NULL.
<em>max_iter</em>, -- [OPTIONAL] Integer identifying the maximum iterations used by the logistic regression solver. Default is 20.
<em>'optimizer'</em>, -- [OPTIONAL] String identifying the optimizer used in the multinomial logistic regression. See the documentation in the multinomial logistic regression for the available options. Default is irls.
<em>tolerance</em>, -- [OPTIONAL] Float identifying the tolerance of the multinomial logistic regression optimizer. Default is 0.0001.
<em>print_warnings</em> -- [OPTIONAL] Boolean specifying if the regression fit should print any warning messages. Default is false.
);
</pre>
Here the <em>'independent_varname'</em> can be the name of a column, which contains
array of numeric values. It can also have a format of string 'array[1, x1, x2, x3]',
where <em>x1</em>, <em>x2</em> and <em>x3</em> are all column names.
Output is stored in the <em>out_table</em>:
<pre>
[ coef | std_err | (z/t)-stats | p_values |
+------+---------+-------------+----------+
</pre>
@examp
-# For function summary information. Run
@verbatim
sql> select robust_variance_{linregr OR logregr OR mlogregr}('help');
OR
sql> select robust_variance_{linregr OR logregr OR mlogregr}();
OR
sql> select robust_variance_{linregr OR logregr OR mlogregr}('?');
@endverbatim
-# For function usage information.
@verbatim
sql> select robust_variance_{linregr OR logregr OR mlogregr}('usage');
@endverbatim
-# Create the sample data set:
@verbatim
sql> SELECT * FROM data;
id | second_attack | treatment | trait_anxiety
----+---------------+-----------+---------------
1 | 1 | 1 | 70
3 | 1 | 1 | 50
5 | 1 | 0 | 40
7 | 1 | 0 | 75
9 | 1 | 0 | 70
11 | 0 | 1 | 65
13 | 0 | 1 | 45
15 | 0 | 1 | 40
17 | 0 | 0 | 55
...
@endverbatim
-# Run the logistic regression function and then compute the robust logistic variance of the regression:
@verbatim
sql> select robust_variance_logregr('patients', 'newTable', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]');
sql> select * from newTable;
coef | {11.962748350258,1.37269168529894,0.00285507335100035}
std_err | {3.45872062333141,1.17161925782182,0.053432886418388}
z_stats | {-1.839833462942,-0.874094587942144,2.22793348156965}
p_values | {0.0657926909738772,0.382066744586027,0.0258849510756295}
@endverbatim
@literature
[1] vce(cluster) function in STATA: http://www.stata.com/help.cgi?vce_option
[2] clustered estimators in R: http://people.su.se/~ma/clustering.pdf
[3] Achim Zeileis: Object-oriented Computation of Sandwich Estimators.
Research Report Series / Department of Statistics and Mathematics, 37.
Department of Statistics and Mathematics, WU Vienna University of Economics and Business, Vienna.
http://cran.r-project.org/web/packages/sandwich/vignettes/sandwich-OOP.pdf
@sa File robust.sql_in documenting the SQL functions.
@internal
@sa Namespace \ref madlib::modules::regress
documenting the implementation in C++
@endinternal
*/
/**
* @internal
* @bDoes error checking on the input for the robust variance regressions for the mandatory inputs.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_input_checking(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
IF (source_table IS NULL OR source_table = '') THEN
RAISE EXCEPTION 'Invalid input table name given.';
END IF;
IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
RAISE EXCEPTION 'Input table name does not exist.';
END IF;
IF (out_table IS NULL OR out_table = '') THEN
RAISE EXCEPTION 'Invalid output table name given.';
END IF;
IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
END IF;
IF (independent_varname IS NULL OR independent_varname = '') THEN
RAISE EXCEPTION 'Invalid independent variable name given.';
END IF;
IF (dependent_varname IS NULL OR dependent_varname = '') THEN
RAISE EXCEPTION 'Invalid dependent variable name given.';
END IF;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Robust Linear Regression ----------------------------------
CREATE TYPE MADLIB_SCHEMA.robust_linregr_result AS (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_transition(
state MADLIB_SCHEMA.bytea8,
y DOUBLE PRECISION,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME'
LANGUAGE C
IMMUTABLE STRICT;
-- Final functions
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.robust_linregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute robust regression diagnostic statistics for linear regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param coef Column containing the array of the OLS coefficients (as obtained by linregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>std_err FLOAT8[]</tt> - Array of huber-white standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT (robust_linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)).*
* FROM (
* SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef
* ) AS ols_coef, <em>sourceName</em> as src;
* </pre>
* - Get a subset of the output columns, e.g., only the condition number
* and the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT (lr).robust_condition_no, (lr).robust_p_values
*FROM (
* </pre>
* <pre> SELECT (robust_linregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)).*
* FROM (
* SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef
* ) AS ols_coef, <em>sourceName</em> as src
*) AS subq;</pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.robust_linregr(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[]) (
SFUNC=MADLIB_SCHEMA.robust_linregr_transition,
STYPE=MADLIB_SCHEMA.bytea8,
FINALFUNC=MADLIB_SCHEMA.robust_linregr_final,
m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.robust_linregr_merge_states,')
INITCOND=''
);
--------------------------- INTERNAL ---------------------------------------
/**
* @brief Return robust linear regression estimates given a set of coefficients
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_linregr_result(
source_table VARCHAR -- name of input table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
)
RETURNS MADLIB_SCHEMA.robust_linregr_result AS $$
DECLARE
robust_value MADLIB_SCHEMA.robust_linregr_result;
BEGIN
EXECUTE '
SELECT (MADLIB_SCHEMA.robust_linregr('
|| dependent_varname || ' , '
|| independent_varname || ' , '
|| 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])
).* FROM ' || source_table
INTO robust_value;
RETURN robust_value;
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Return insert string for robust linear regression
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_linregr_insert_string(
robust_lin_rst MADLIB_SCHEMA.robust_linregr_result
, linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
, out_table TEXT
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
insert_string := insert_string ||
CASE
WHEN linregr_coeffs is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '], '
END ||
CASE
WHEN (robust_lin_rst).std_err is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_lin_rst).std_err, ',') || '], '
END ||
CASE
WHEN (robust_lin_rst).t_stats is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_lin_rst).t_stats, ',') || '], '
END ||
CASE
WHEN (robust_lin_rst).p_values is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_lin_rst).p_values, ',') || ']'
END;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
usage_string VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN
insert_string := '' ||
E'Summary \n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' Functionality: Calculate Huber-White robust statistics for linear regression\n' ||
E' The function first runs the regression to calculate the \n' ||
E' coefficients and uses them to calculate the robust statistics \n' ||
E' The functions robust_variance_linregr() and robust_variance_logregr() have nearly identical interfaces\n' ||
E' SELECT {schema_madlib}.robust_variance_linregr(''source_table'' \n' ||
E' ,''output_table'' \n' ||
E' ,''dependent_variable'' \n' ||
E' ,''independent_variable'' \n' ||
E' ,''group_cols'' \n' ||
E' );\n' ||
E'For more details on function usage: \n' ||
E'SELECT {schema_madlib}.robust_variance_linregr(''usage'') \n';
ElSIF (usage_string = 'usage') THEN
insert_string := '' ||
E'Usage\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' To use this function \n' ||
E' SELECT {schema_madlib}.robust_variance_linregr( \n' ||
E' ''source_table'', -- Name of data table \n' ||
E' ''output_table'', -- Name of result table \n' ||
E' ''dependent_variable'', -- Name of column for dependent variables\n' ||
E' ''independent_variable'', -- Name of column for independent variables\n' ||
E' (can be any SQL expression that evaluates to an array) \n' ||
E' ''group_cols'', -- [OPTIONAL] Comma separated string with columns to group by. Default is NULL. \n' ||
E' );\n' ||
E'\n' ||
E'Output:\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' The output table (''output_table'' above) has the following columns\n' ||
E' ''coef'' DOUBLE PRECISION[], -- Coefficients of regression \n' ||
E' ''std_err'' DOUBLE PRECISION[], -- Huber-White standard errors\n' ||
E' ''stats'' DOUBLE PRECISION[], -- T-stats of the standard errors\n' ||
E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' ||
E'\n' ||
E'';
ELSE
insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_linregr()';
END IF;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.robust_variance_linregr('');
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Robust linear regression with default fit regression behaviour & no grouping
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_linregr(
source_table,
out_table,
dependent_variable,
independent_variable,
NULL);
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Robust Linear Regression ----------------------------------
/**
* @brief Robust linear regression function subcall
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_varname VARCHAR -- name of dependent variable
, input_independent_varname VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- grouping columns
)
RETURNS VOID AS $$
DECLARE
insert_string VARCHAR;
group_cols VARCHAR[];
regr_coef DOUBLE PRECISION[];
robust_lin_rst MADLIB_SCHEMA.robust_linregr_result;
table_creation_string VARCHAR;
group_string VARCHAR;
where_group_string VARCHAR;
actual_table_name VARCHAR;
input_table_name VARCHAR[];
group_array_length INTEGER;
each_group INTEGER;
robust_linregr_fitting_rst VARCHAR;
temp_table VARCHAR;
col_data_type VARCHAR;
schema_name VARCHAR;
sql VARCHAR;
old_msg_level TEXT;
independent_varname VARCHAR;
BEGIN
EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, input_independent_varname);
independent_varname := input_independent_varname;
--This code should be added back in should '*' support be implemented
/*IF (input_independent_varname = '*') THEN
independent_varname := MADLIB_SCHEMA.__internal_get_col_names_except_dep_variable(source_table, input_independent_varname);
ELSE
independent_varname := input_independent_varname;
END IF;*/
IF (input_group_cols IS NULL)
THEN
-------------------------------------------------------------------------
-- No grouping columns: Run regular robust variance estimates
-------------------------------------------------------------------------
-- Step 1: Create output table with appropriate column names
EXECUTE '
CREATE TABLE ' || out_table || ' (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[])';
-- Step 2: Run the regression if the coefficients are not provided
EXECUTE '
SELECT (MADLIB_SCHEMA.linregr('
|| dependent_varname || ' , '
|| independent_varname || ')
).coef FROM ' || source_table
INTO regr_coef;
-- Step 3: Perform the robust linear regression
robust_lin_rst := MADLIB_SCHEMA.__internal_get_robust_linregr_result(
source_table, dependent_varname, independent_varname, regr_coef);
-- Step 4: Insert into output table & cast infinity and nan
insert_string := MADLIB_SCHEMA.__internal_get_robust_linregr_insert_string(
robust_lin_rst, regr_coef, out_table);
insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
'''Infinity''::double precision', 'gi');
insert_string := REGEXP_REPLACE(insert_string, 'NaN',
'''NaN''::double precision', 'gi');
EXECUTE insert_string || ')';
ELSE
-------------------------------------------------------------------------
-- Grouping columns: Run robust variance estimates for each group
-------------------------------------------------------------------------
-- Step 1: Prepare the group columns
-------------------------------------------------------------------------
group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
group_array_length = array_upper(group_cols, 1);
input_table_name = regexp_split_to_array(source_table, E'\\.');
IF array_upper(input_table_name, 1) = 1 THEN
actual_table_name = input_table_name[1];
schema_name := current_schema();
ELSIF array_upper(input_table_name, 1) = 2 THEN
actual_table_name = input_table_name[2];
schema_name = input_table_name[1];
ELSE
RAISE EXCEPTION 'Incorrect input source table name provided';
END IF;
-- Check that each grouping column exists
FOR each_group in 1 .. group_array_length
LOOP
if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
group_cols[each_group]) THEN
RAISE EXCEPTION 'Grouping column % does not exist',
group_cols[each_group];
END IF;
END LOOP;
table_creation_string := 'CREATE TABLE ' || out_table || '(';
FOR each_group in 1 .. group_array_length
LOOP
-- create a string that makes list of
EXECUTE 'SELECT data_type FROM information_schema.columns
WHERE
table_schema = ''' || schema_name || '''
AND table_name = ''' || actual_table_name || '''
AND column_name= ''' || group_cols[each_group] || ''''
INTO col_data_type;
table_creation_string := table_creation_string
|| group_cols[each_group]
|| ' ' || col_data_type || ',';
END LOOP;
-- finish creating the output table
EXECUTE table_creation_string || '
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
t_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[])';
-- Step 2: Perform the linear regression on groups
-------------------------------------------------------------------------
temp_table := MADLIB_SCHEMA.__unique_string();
PERFORM MADLIB_SCHEMA.linregr_train(source_table,
temp_table,
dependent_varname,
independent_varname,
input_group_cols);
-- Note: We need to alter the names on the output tables for linregr_train
-- so that they don't clash with the robust_variance_linregr output tables
-- This name changing is required to construct queries when the regressor
-- is also being grouped by. I don't know why anyone would do that but
-- it should be a mathematically valid operation.
FOR each_group in 1 .. group_array_length
LOOP
EXECUTE 'ALTER TABLE ' || temp_table || ' RENAME COLUMN ' ||
group_cols[each_group] || ' TO lin_regr_'
|| group_cols[each_group];
END LOOP;
-- Step 3: Prepare the grouping strings
group_string := '';
FOR each_group in 1 .. (group_array_length-1)
LOOP
group_string := group_string ||
actual_table_name || '.' ||
group_cols[each_group] || ',';
END LOOP;
group_string := group_string ||
actual_table_name || '.' ||
group_cols[group_array_length];
-- Where clause string
where_group_string := '';
FOR each_group in 1 .. (group_array_length-1)
LOOP
where_group_string := where_group_string || '(' ||
temp_table || '.lin_regr_' ||
group_cols[each_group] || ' = ' ||
actual_table_name || '.' ||
group_cols[each_group] || ') AND ';
END LOOP;
where_group_string := where_group_string || '(' ||
temp_table || '.lin_regr_' ||
group_cols[group_array_length] || ' = ' ||
actual_table_name || '.' ||
group_cols[group_array_length] || ')';
-- Step 4: Put robust statistics into the out table
EXECUTE
'INSERT INTO ' || out_table || '
SELECT ' || group_string || ', (result).coef, (result).std_err,
(result).t_stats, (result).p_values
FROM (
SELECT
'|| group_string ||',
MADLIB_SCHEMA.robust_linregr('||
dependent_varname ||','||
independent_varname || ', coef) AS result' ||
' FROM '|| source_table || ',' || temp_table ||
' WHERE '|| where_group_string ||
' GROUP BY '|| group_string || ')' || actual_table_name;
EXECUTE 'DROP TABLE IF EXISTS ' || temp_table;
END IF;
--Restore message settings
EXECUTE 'SET client_min_messages TO '|| old_msg_level;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- ROBUST LOGISTIC REGRESSION ---------------------------------------
/**
* @internal
* @bCompute the results of the Huber-White sandwich estimator
*
*/
--DROP TYPE IF EXISTS MADLIB_SCHEMA.robust_logregr_result;
CREATE TYPE MADLIB_SCHEMA.robust_logregr_result AS (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_transition(
state DOUBLE PRECISION[],
y BOOLEAN,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_merge_states(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_final(
state DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.robust_logregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute robust regression diagnostic statistics for logistic regression.
*
* @param dependentVariable Column containing the dependent variable
* @param independentVariables Column containing the array of independent variables
* @param coef Column containing the array of the coefficients (as obtained by logregr)
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @return A composite value:
* - <tt>coef FLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of huber-white standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
* @usage
* - Get all the diagnostic statistics:\n
*
* <pre> SELECT robust_logregr(<em>dependentVariable</em>,
* <em>independentVariables</em>, coef)
* FROM <em>dataTable</em>;
* </pre>
*/
CREATE AGGREGATE MADLIB_SCHEMA.robust_logregr(
/*+ "dependentVariable" */ BOOLEAN,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[] ) (
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.robust_logregr_step_transition,
m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.robust_logregr_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.robust_logregr_step_final,
INITCOND='{0,0,0,0,0.0}'
);
--------------------------- INTERNAL ---------------------------------------
/**
* @brief Return robust logistic regression estimates given a set of coefficients
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_logregr_result(
source_table VARCHAR -- name of input table
, dependent_varname VARCHAR -- name of dependent variable
, independent_varname VARCHAR -- name of independent variable
, logregr_coeffs DOUBLE PRECISION[] -- coeffs from logear regression
)
RETURNS MADLIB_SCHEMA.robust_logregr_result AS $$
DECLARE
robust_value MADLIB_SCHEMA.robust_logregr_result;
BEGIN
EXECUTE 'SELECT (MADLIB_SCHEMA.robust_logregr(('|| dependent_varname || ' )::BOOLEAN, '|| independent_varname || ' , '|| 'ARRAY[' || array_to_string(logregr_coeffs, ',') || '])).* FROM ' || source_table
INTO robust_value;
RETURN robust_value;
END
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Return insert string for robust logistic regression
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_logregr_insert_string(
robust_log_rst MADLIB_SCHEMA.robust_logregr_result,
out_table TEXT
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
insert_string := insert_string ||
CASE
WHEN (robust_log_rst).coef is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_log_rst).coef, ',') || '], '
END ||
CASE
WHEN (robust_log_rst).std_err is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_log_rst).std_err, ',') || '], '
END ||
CASE
WHEN (robust_log_rst).z_stats is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_log_rst).z_stats, ',') || '], '
END ||
CASE
WHEN (robust_log_rst).p_values is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_log_rst).p_values, ',') || '] '
END;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
usage_string VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN
insert_string := '' ||
E'Summary \n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' Functionality: Calculate Huber-White robust statistics for logistic regression\n' ||
E' The function first runs the regression to calculate the \n' ||
E' coefficients and uses them to calculate the robust statistics \n' ||
E' The functions robust_variance_linregr() and robust_variance_logregr() have nearly identical interfaces\n' ||
E' SELECT {schema_madlib}.robust_variance_logregr(''source_table'' \n' ||
E' ,''output_table'' \n' ||
E' ,''dependent_variable'' \n' ||
E' ,''independent_variable'' \n' ||
E' ,''group_cols'' \n' ||
E' ,''max_iter'' \n' ||
E' ,''optimizer'' \n' ||
E' ,''tolerance'' \n' ||
E' ,''print_warnings'' \n' ||
E' );\n' ||
E'For more details on function usage: \n' ||
E'SELECT {schema_madlib}.robust_variance_logregr(''usage'') \n';
ElSIF (usage_string = 'usage') THEN
insert_string := '' ||
E'Usage\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' To use this function \n' ||
E' SELECT {schema_madlib}.robust_variance_logregr( \n' ||
E' ''source_table'', -- Name of data table \n' ||
E' ''output_table'', -- Name of result table \n' ||
E' ''dependent_variable'', -- Name of column for dependent variables\n' ||
E' ''independent_variable'', -- Name of column for independent variables\n' ||
E' (can be any SQL expression that evaluates to an array) \n' ||
E' ''group_cols'', -- [OPTIONAL] Comma separated string with columns to group by. Default is NULL. \n' ||
E' ''max_iter'', -- [OPTIONAL] The number of iterations used by the logistic regression solver. Default is 20. \n' ||
E' ''optimizer'', -- [OPTIONAL] Name of the optimizer used in the logistic regression. Default is irls. \n' ||
E' ''tolerance'', -- [OPTIONAL] The tolerance of the logistic regression optimizer. Default is 0.0001. \n' ||
E' ''print_warnings'', -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE. \n' ||
E' );\n' ||
E'\n' ||
E'Output:\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' The output table (''output_table'' above) has the following columns\n' ||
E' ''coef'' DOUBLE PRECISION[], -- Coefficients of regression \n' ||
E' ''std_err'' DOUBLE PRECISION[], -- Huber-White standard errors\n' ||
E' ''stats'' DOUBLE PRECISION[], -- Z-stats of the standard errors\n' ||
E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' ||
E'\n' ||
E'';
ELSE
insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_logregr()';
END IF;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.robust_variance_logregr('');
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
*@brief The robust logistic regression function.
*
* @param source_table String identifying the input table
* @param out_table String identifying the output table to be created
* @param dependent_varname Column containing the dependent variable
* @param independent_varname Column containing the array of independent variables
* @param input_group_cols Columns to group by.
* @param max_iter Integer identifying the maximum iterations used by the logistic regression solver. Default is 20.
* @param optimizer String identifying the optimizer used in the logistic regression. See the documentation in the logistic regression for the available options. Default is irls.
* @param tolerance Float identifying the tolerance of the logistic regression optimizer. Default is 0.0001.
* @param print_warnings Boolean specifying if the regression fit should print any warning messages. Default is false.
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independent_varname</tt> array to 1.
*
* @return A composite value:
* - <tt>std_err FLOAT8[]</tt> - Array of huber-white standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
* @usage
* For function summary information. Run
* sql> select robust_variance_logregr('help');
* OR
* sql> select robust_variance_logregr();
* OR
* sql> select robust_variance_logregr('?');
* For function usage information. Run
* sql> select robust_variance_logregr('usage');
* - Compute the coefficients, and the get the robust diagnostic statistics:
* <pre>
* select robust_variance_logregr(source_table, out_table, regression_type, dependentVariable, independentVariables, NULL );
* </pre>
*
* - If the coefficients are already known, they can be provided directly
* <pre>select robust_variance_logregr(source_table, out_table, regression_type, dependentVariable, independentVariables, coef );
</pre>
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_varname VARCHAR, -- name of dependent variable
input_independent_varname VARCHAR, -- name of independent variable
input_group_cols VARCHAR, -- grouping columns
max_iter INTEGER, -- maximum of iterations in the optimizer
optimizer VARCHAR, -- the optimizer used
tolerance DOUBLE PRECISION,
print_warnings BOOLEAN
)
RETURNS VOID AS $$
DECLARE
insert_string VARCHAR;
group_cols VARCHAR[];
regr_coef DOUBLE PRECISION[];
table_creation_string VARCHAR;
group_string VARCHAR;
where_group_string VARCHAR;
actual_table_name VARCHAR;
input_table_name VARCHAR[];
group_array_length INTEGER;
each_group INTEGER;
robust_logregr_fitting_rst VARCHAR;
temp_table VARCHAR;
col_data_type VARCHAR;
schema_name VARCHAR;
sql VARCHAR;
tempTableName VARCHAR;
robust_log_rst MADLIB_SCHEMA.robust_logregr_result;
old_msg_level TEXT;
independent_varname VARCHAR;
print_warningsString VARCHAR;
BEGIN
EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, input_independent_varname);
independent_varname := input_independent_varname;
--This code should be added back in should '*' support be implemented
--IF (input_independent_varname = '*') THEN
-- independent_varname := MADLIB_SCHEMA.__internal_get_col_names_except_dep_variable(source_table, input_independent_varname);
-- ELSE
-- independent_varname := input_independent_varname;
-- END IF;
IF (input_group_cols IS NULL)
THEN
-- create output table with appropriate column names
EXECUTE '
CREATE TABLE ' || out_table || ' (
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[])';
-- Run the regression if the coefficients are not provided
print_warningsString := 'FALSE';
IF(print_warnings IS TRUE)
THEN
print_warningsString := 'TRUE';
END IF;
tempTableName := MADLIB_SCHEMA.__unique_string();
EXECUTE '
SELECT MADLIB_SCHEMA.logregr_train('
|| ' '''||source_table||''' ' || ','
|| ' '''||tempTableName||''' ' || ','
|| ' '''||dependent_varname ||''' ' || ' , '
|| ' '''||independent_varname||''' '||' , '
|| 'NULL'|| ' , '
||max_iter || ' , '
|| ' '''||optimizer|| ''' '|| ' , '
|| tolerance||' , '
|| print_warningsString||
')';
EXECUTE 'SELECT coef from ' || tempTableName INTO regr_coef;
EXECUTE 'DROP TABLE IF EXISTS ' || tempTableName;
-- compute robust variance calculation
robust_log_rst := MADLIB_SCHEMA.__internal_get_robust_logregr_result(
source_table, dependent_varname, independent_varname, regr_coef);
insert_string := MADLIB_SCHEMA.__internal_get_robust_logregr_insert_string(
robust_log_rst, out_table);
-- Ensure Infinity and NaN are cast properly
insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
'''Infinity''::double precision', 'gi');
insert_string := REGEXP_REPLACE(insert_string, 'NaN',
'''NaN''::double precision', 'gi');
-- complete the sql string and execute
EXECUTE insert_string || ')';
ELSE
-------------------------------------------------------------------------
-- Grouping columns: Run robust variance estimates for each group
-------------------------------------------------------------------------
-- Step 1: Prepare the group columns
-------------------------------------------------------------------------
group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
group_array_length = array_upper(group_cols, 1);
input_table_name = regexp_split_to_array(source_table, E'\\.');
IF array_upper(input_table_name, 1) = 1 THEN
actual_table_name = input_table_name[1];
schema_name := current_schema();
ELSIF array_upper(input_table_name, 1) = 2 THEN
actual_table_name = input_table_name[2];
schema_name = input_table_name[1];
ELSE
RAISE EXCEPTION 'Incorrect input source table name provided';
END IF;
-- Check that each grouping column exists
FOR each_group in 1 .. group_array_length
LOOP
if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
group_cols[each_group]) THEN
RAISE EXCEPTION 'Grouping column % does not exist',
group_cols[each_group];
END IF;
END LOOP;
table_creation_string := 'CREATE TABLE ' || out_table || '(';
FOR each_group in 1 .. group_array_length
LOOP
-- create a string that makes list of
EXECUTE 'SELECT data_type FROM information_schema.columns
WHERE
table_schema = ''' || schema_name || '''
AND table_name = ''' || actual_table_name || '''
AND column_name= ''' || group_cols[each_group] || ''''
INTO col_data_type;
table_creation_string := table_creation_string
|| group_cols[each_group]
|| ' ' || col_data_type || ',';
END LOOP;
-- finish creating the output table
EXECUTE table_creation_string || '
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[])';
-- Step 2: Perform the regression on groups
-------------------------------------------------------------------------
temp_table := MADLIB_SCHEMA.__unique_string();
PERFORM MADLIB_SCHEMA.logregr_train(source_table,
temp_table,
dependent_varname,
independent_varname,
input_group_cols,
max_iter,
optimizer,
tolerance,
print_warnings);
-- Note: We need to alter the names on the output tables for logregr_train
-- so that they don't clash with the robust_variance_logregr output tables
-- This name changing is required to construct queries when the regressor
-- is also being grouped by. I don't know why anyone would do that but
-- it should be a mathematically valid operation.
FOR each_group in 1 .. group_array_length
LOOP
EXECUTE 'ALTER TABLE ' || temp_table || ' RENAME COLUMN ' ||
group_cols[each_group] || ' TO log_regr_'
|| group_cols[each_group];
END LOOP;
-- Step 3: Prepare the grouping strings
group_string := '';
FOR each_group in 1 .. (group_array_length-1)
LOOP
group_string := group_string ||
actual_table_name || '.' ||
group_cols[each_group] || ',';
END LOOP;
group_string := group_string ||
actual_table_name || '.' ||
group_cols[group_array_length];
-- Where clause string
where_group_string := '';
FOR each_group in 1 .. (group_array_length-1)
LOOP
where_group_string := where_group_string || '(' ||
temp_table || '.log_regr_' ||
group_cols[each_group] || ' = ' ||
actual_table_name || '.' ||
group_cols[each_group] || ') AND ';
END LOOP;
where_group_string := where_group_string || '(' ||
temp_table || '.log_regr_' ||
group_cols[group_array_length] || ' = ' ||
actual_table_name || '.' ||
group_cols[group_array_length] || ')';
-- Step 4: Robust statistics into the out table
EXECUTE
'INSERT INTO ' || out_table || '
SELECT ' || group_string || ', (result).coef, (result).std_err,
(result).z_stats, (result).p_values
FROM (
SELECT
'|| group_string ||',
MADLIB_SCHEMA.robust_logregr('||
dependent_varname ||','||
independent_varname || ', coef) AS result' ||
' FROM '|| source_table || ',' || temp_table ||
' WHERE '|| where_group_string ||
' GROUP BY '|| group_string || ')' || actual_table_name;
EXECUTE 'DROP TABLE IF EXISTS ' || temp_table;
END IF;
--Restore message settings
EXECUTE 'SET client_min_messages TO '|| old_msg_level;
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Robust logistic function subcall
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_variable VARCHAR, -- name of dependent variable
independent_variable VARCHAR, -- name of independent variable
input_group_cols VARCHAR -- grouping columns
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
input_group_cols,
20,
'irls',
0.0001,
FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Robust logistic regression with default fit regression behavior, and no grouping,
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR -- name of input table
,out_table VARCHAR -- name of output table
,dependent_variable VARCHAR -- name of dependent variable
,independent_variable VARCHAR -- name of independent variable
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
NULL,
20,
'irls',
0.0001,
FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- grouping columns
, max_iter INTEGER -- max number of iterations
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
input_group_cols,
max_iter,
'irls',
0.0001,
FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- grouping columns
, max_iter INTEGER -- max number of iterations
, optimizer VARCHAR -- The optimizer used in the robust regression
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
input_group_cols,
max_iter,
optimizer,
0.0001,
FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
source_table VARCHAR -- name of input table
, out_table VARCHAR -- name of output table
, dependent_variable VARCHAR -- name of dependent variable
, independent_variable VARCHAR -- name of independent variable
, input_group_cols VARCHAR -- grouping columns
, max_iter INTEGER -- max number of iterations
, optimizer VARCHAR -- The optimizer used in the robust regression
, tolerance DOUBLE PRECISION -- The tolerance of the optimizer
)
RETURNS VOID AS $$
BEGIN
PERFORM MADLIB_SCHEMA.robust_variance_logregr(
source_table,
out_table,
dependent_variable,
independent_variable,
input_group_cols,
max_iter,
optimizer,
tolerance,
FALSE);
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- ROBUST MULTINOMIAL LOGISTIC REGRESSION -------------------------
CREATE TYPE MADLIB_SCHEMA.robust_mlogregr_result AS (
ref_category INTEGER,
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_transition
(
state DOUBLE PRECISION[],
y INTEGER,
numCategories INTEGER,
ref_category INTEGER,
x DOUBLE PRECISION[],
coef DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_merge_states
(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[]
)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_final
(
state DOUBLE PRECISION[]
)
RETURNS MADLIB_SCHEMA.robust_mlogregr_result
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Compute robust regression diagnostic statistics for multinomial logistic regression.
*
* @param source_table name of input table, VARCHAR
* @param out_table name of output table, VARCHAR
* @param dependent_varname dependent variable, VARCHAR
* @param independent_varname independent variables, VARCHAR
* @param ref_category Integer specifying the reference category. Default is 0.
* @param grouping_cols grouping variables, VARCHAR. Default is NULL. Currently a placeholder.
* @param max_iter Integer identifying the maximum iterations used by the logistic regression solver. Default is 20.
* @param optimizer String identifying the optimizer used in the multinomial logistic regression. See the documentation in the multinomial logistic regression for the available options. Default is 'irls'.
* @param tolerance Float identifying the tolerance of the multinomial logistic regression optimizer. Default is 0.0001.
* @param print_warnings Boolean specifying if the multinomial logistic regression solver should print any warnings. Currently a placeholder.
*
*
* @par
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @usage
* <pre>
* SELECT madlib.robust_variance_mlogregr(
* <em>'source_table'</em>, -- name of input table, VARCHAR
* <em>'out_table'</em>, -- name of output table, VARCHAR
* <em>'dependent_varname'</em>, -- dependent variable, VARCHAR
* <em>'independent_varname'</em>, -- independent variables, VARCHAR
* <em>ref_category</em>, -- [OPTIONAL] Integer specifying the reference category. Default is 0.
* <em>'grouping_cols'</em>, -- [OPTIONAL] grouping variables, VARCHAR. Default is NULL.
* <em>max_iter</em>, -- [OPTIONAL] Integer identifying the maximum iterations used by the logistic regression solver. Default is 20.
* <em>'optimizer'</em>, -- [OPTIONAL] String identifying the optimizer used in the multinomial logistic regression. See the documentation in the multinomial logistic regression for the available options. Default is irls.
* <em>tolerance</em>, -- [OPTIONAL] Float identifying the tolerance of the multinomial logistic regression optimizer. Default is 0.0001.
* <em>print_warnings</em> -- [OPTIONAL] Boolean specifying if the regression fit should print any warning messages. Default is false.
* );
* </pre>
*
* @return A composite value:
* - <tt>ref_category INTEGER</tt> - The reference category
* - <tt>coef FLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of huber-white standard errors,
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics,
* - <tt>p_values FLOAT8[]</tt> - Array of p-values,
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR, -- name of input table
out_table VARCHAR, -- name of output table
dependent_varname VARCHAR, -- name of dependent variable
independent_varname VARCHAR, -- name of independent variable
ref_category INTEGER, -- reference category (default 0)
input_group_cols VARCHAR, -- grouping columns (PLACEHOLDER) (default NULL)
max_iter INTEGER, -- max number of itertions (default 20)
optimizer VARCHAR, -- optimizer ['irls', 'newton'] (default 'irls')
tolerance DOUBLE PRECISION, -- tolerance (default 0.0001)
print_warnings BOOLEAN -- PLACEHOLDER (default False)
)
RETURNS VOID AS $$
DECLARE
num_category INTEGER;
regr_coef DOUBLE PRECISION[];
robust_value MADLIB_SCHEMA.robust_mlogregr_result;
insert_string VARCHAR;
old_msg_level VARCHAR;
BEGIN
EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, independent_varname);
-- Run the regression if the coefficients are not provided
-- Will do further parameter checking in the regression function
EXECUTE
$sql$SELECT coef FROM MADLIB_SCHEMA.mlogregr('$sql$
|| source_table || $sql$', '$sql$
|| dependent_varname || $sql$', '$sql$
|| independent_varname || $sql$', $sql$
|| max_iter || $sql$, '$sql$
|| optimizer || $sql$', $sql$
|| tolerance || ','
|| ref_category || ')'
INTO regr_coef;
EXECUTE
'SELECT count(DISTINCT ' || dependent_varname|| ') FROM ' ||
textin(regclassout(source_table))
INTO num_category;
-- compute robust variance calculation
--robust_log_rst := MADLIB_SCHEMA.__internal_get_robust_mlogregr_result(
-- source_table, dependent_varname, num_category, ref_category,
-- independent_varname, regr_coef);
EXECUTE
'SELECT (MADLIB_SCHEMA.robust_mlogregr('
|| dependent_varname || ', '
|| num_category || ', '
|| ref_category || ', '
|| independent_varname || ', '
|| 'ARRAY[' || array_to_string(regr_coef, ',') ||
'])).* FROM ' || source_table
INTO robust_value;
insert_string :=
MADLIB_SCHEMA.__internal_get_robust_mlogregr_insert_string(
robust_value, out_table);
-- Ensure Infinity and NaN are cast properly
insert_string := REGEXP_REPLACE(
insert_string, 'Infinity', '''Infinity''::double precision', 'gi');
insert_string := REGEXP_REPLACE(
insert_string, 'NaN', '''NaN''::double precision', 'gi');
-- create output table with appropriate column names
EXECUTE
'CREATE TABLE ' || out_table || ' (
ref_category INTEGER,
coef DOUBLE PRECISION[],
std_err DOUBLE PRECISION[],
z_stats DOUBLE PRECISION[],
p_values DOUBLE PRECISION[])';
-- complete the sql string and execute
EXECUTE insert_string || ')';
--Restore message settings
EXECUTE 'SET client_min_messages TO '|| old_msg_level;
END;
$$ LANGUAGE plpgsql VOLATILE;
/**
* @brief Robust multinomial logistic function subcall
* @param dependentVariable Column containing the dependent variable
* @param numCategories Number of categories in the dependent variable
* @param ref_category Reference category for modeling
* @param independentVariables Column containing the array of independent variables
* @param coef Column containing the array of the coefficients (as obtained by mlogregr)
* @usage
* <pre>
* SELECT robust_mlogregr(
* <em>dependentVariable</em>,
* <em>numCategories</em>,
* <em>ref_category</em>,
* <em>independentVariables</em>,
* <em>coef</em>)
* FROM <em>dataTable</em>;
* </pre>
* @return A composite value:
* - <tt>ref_category INTEGER</tt> - The reference category
* - <tt>coef FLOAT8[] </tt> - The coefficients for the regression
* - <tt>std_err FLOAT8[]</tt> - Array of huber-white standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol t \f$
* - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
*
*/
CREATE AGGREGATE MADLIB_SCHEMA.robust_mlogregr(
/*+ "dependentVariable" */ INTEGER,
/*+ "numCategories" */ INTEGER,
/*+ "ref_category" */ INTEGER,
/*+ "independentVariables" */ DOUBLE PRECISION[],
/*+ "coef" */ DOUBLE PRECISION[]
)
(
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.mlogregr_robust_step_transition,
m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlogregr_robust_step_merge_states,')
FINALFUNC=MADLIB_SCHEMA.mlogregr_robust_step_final,
INITCOND='{0,0,0,0,0}'
);
--------------------------- INTERNAL ---------------------------------------
/**
* @brief Return insert string for robust multinomial logistic regression
**/
CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_mlogregr_insert_string(
robust_mlog_rst MADLIB_SCHEMA.robust_mlogregr_result,
out_table TEXT
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
insert_string := insert_string ||
(robust_mlog_rst).ref_category || ', ' ||
CASE
WHEN (robust_mlog_rst).coef is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_mlog_rst).coef, ',') || '], '
END ||
CASE
WHEN (robust_mlog_rst).std_err is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_mlog_rst).std_err, ',') || '], '
END ||
CASE
WHEN (robust_mlog_rst).z_stats is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_mlog_rst).z_stats, ',') || '], '
END ||
CASE
WHEN (robust_mlog_rst).p_values is NULL
THEN '''{}'','
ELSE 'ARRAY[' || array_to_string((robust_mlog_rst).p_values, ',') || '] '
END;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Interface ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
usage_string VARCHAR -- usage string
)
RETURNS VARCHAR AS $$
DECLARE
insert_string VARCHAR;
BEGIN
IF (usage_string = '' OR usage_string = 'help' OR usage_string = '?') THEN
insert_string := '' ||
E'Summary \n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' Functionality: Calculate Huber-White robust statistics for multinomial logistic regression\n' ||
E' The funciton first runs the regression to calculate the \n' ||
E' coefficients and uses them to calculate the robust statistics \n' ||
E' SELECT {schema_madlib}.robust_variance_mlogregr(''source_table'' \n' ||
E' ,''output_table'' \n' ||
E' ,''dependent_variable'' \n' ||
E' ,''independent_variable'' \n' ||
E' ,''reference_category'' \n' ||
E' ,''group_cols'' \n' ||
E' ,''max_iter'' \n' ||
E' ,''optimizer'' \n' ||
E' ,''tolerance'' \n' ||
E' ,''verbose'' \n' ||
E' );\n' ||
E'For more details on function usage: \n' ||
E'SELECT {schema_madlib}.robust_variance_mlogregr(''usage'') \n';
ElSIF (usage_string = 'usage') THEN
insert_string := '' ||
E'Usage\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' To use this function \n' ||
E' SELECT {schema_madlib}.robust_variance_mlogregr( \n' ||
E' ''source_table'', -- Name of data table\n' ||
E' ''output_table'', -- Name of result table\n' ||
E' ''dependent_variable'', -- Name of column for dependent variables\n' ||
E' ''independent_variable'', -- Name of column for independent variables\n' ||
E' (can be any SQL expression that evaluates to an array) \n' ||
E' ''reference_category'', -- [OPTIONAL] Reference category. Default is 0. \n' ||
E' ''group_cols'', -- [OPTIONAL] Comma separated string with columns to group by. Default is NULL. \n' ||
E' ''max_iter'', -- [OPTIONAL] The number of iterations used by the logistic regression solver. Default is 20. \n' ||
E' ''optimizer'', -- [OPTIONAL] Name of the optimizer used in the logistic regression. Default is irls. \n' ||
E' ''tolerance'', -- [OPTIONAL] The tolerance of the logistic regression optimizer. Default is 0.0001. \n' ||
E' ''verbose'' -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE. \n' ||
E' );\n' ||
E'\n' ||
E'Output:\n' ||
E'-----------------------------------------------------------------------------------------\n' ||
E' The output table (''output_table'' above) has the following columns\n' ||
E' ''ref_category" INTEGER, -- Reference category\n' ||
E' ''coef'' DOUBLE PRECISION[], -- Coefficients of regression \n' ||
E' ''std_err'' DOUBLE PRECISION[], -- Huber-White standard errors\n' ||
E' ''z_stats'' DOUBLE PRECISION[], -- Z-stats of the standard errors\n' ||
E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' ||
E'\n' ||
E'';
ELSE
insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_mlogregr()';
END IF;
RETURN insert_string;
END;
$$ LANGUAGE plpgsql VOLATILE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr()
RETURNS VARCHAR AS $$
BEGIN
RETURN MADLIB_SCHEMA.robust_variance_mlogregr('');
END;
$$ LANGUAGE plpgsql VOLATILE;
--------------------------- Robust Multinomial Logistic Regression ----------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
ref_category INTEGER,
input_group_cols VARCHAR,
max_iter INTEGER,
optimizer VARCHAR,
tolerance DOUBLE PRECISION
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
$8, $9, FALSE);
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
ref_category INTEGER,
input_group_cols VARCHAR,
max_iter INTEGER,
optimizer VARCHAR
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
$8, 0.0001, FALSE);
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
ref_category INTEGER,
input_group_cols VARCHAR,
max_iter INTEGER
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
'irls', 0.0001, FALSE);
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
ref_category INTEGER,
input_group_cols VARCHAR
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, 20,
'irls', 0.0001, FALSE);
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
ref_category INTEGER
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, NULL, 20,
'irls', 0.0001, FALSE);
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
source_table VARCHAR,
out_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, 0, NULL, 20,
'irls', 0.0001, FALSE);
$$ LANGUAGE sql;