blob: 8f23e228014cd301134c2f4953658e998fbd6a97 [file] [log] [blame]
m4_include(`SQLCommon.m4')
/* -----------------------------------------------------------------------------
* Test Linear Regression.
* -------------------------------------------------------------------------- */
/*
* The following example is taken from:
* http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1
* http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3
*/
DROP TABLE IF EXISTS weibull;
CREATE TABLE weibull (
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
/*
* We materialize the table here because on MPP systems we want to distribute
* the data to the segments (and not do the calculations in memory).
*/
INSERT INTO weibull(id, x1, x2, y) VALUES
( 1, 41.9, 29.1, 251.3),
( 2, 43.4, 29.3, 251.3),
( 3, 43.9, 29.5, 248.3),
( 4, 44.5, 29.7, 267.5),
( 5, 47.3, 29.9, 273.0),
( 6, 47.5, 30.3, 276.5),
( 7, 47.9, 30.5, 270.3),
( 8, 50.2, 30.7, 274.9),
( 9, 52.8, 30.8, 285.0),
(10, 53.2, 30.9, 290.0),
(11, 56.7, 31.5, 297.0),
(12, 57.0, 31.7, 302.5),
(13, 63.5, 31.9, 304.5),
(14, 65.3, 32.0, 309.3),
(15, 71.1, 32.1, 321.7),
(16, 77.0, 32.5, 330.7),
(17, 77.8, 32.9, 349.0);
--------------------- Robust linear regression install checks -----------
SELECT * FROM (
SELECT (robust_linregr(y, ARRAY[1, x1, x2], coef)).*
FROM
( SELECT (linregr(y, ARRAY[1, x1, x2])).coef
FROM weibull
) as mle_coef,
weibull as src
) q;
DROP TABLE IF EXISTS result_weibull_3;
DROP TABLE IF EXISTS result_weibull_3_summary;
select robust_variance_linregr('weibull', 'result_weibull_3', 'y', 'ARRAY[1, x1, x2]');
-- End of robust linear regression
/* Robust variance test
* The following example is uses the coefficients computed in the above
* example, and checks that the robust variance is correct.
* The correct values were checked with R. For completeness, the following
* R code segment can be used to reproduce the correct values. This code segment
* expects the data table 'patients' to be in a file called "patients.txt".
* the data file should look exactly like:
second_attack, treatment, trait_anxiety
1, 1, 70
1, 1, 80
1, 1, 50
1, 0, 60
1, 0, 40
1, 0, 65
1, 0, 75
1, 0, 80
1, 0, 70
1, 0, 60
0, 1, 65
0, 1, 50
0, 1, 45
0, 1, 35
0, 1, 40
0, 1, 50
0, 0, 55
0, 0, 45
0, 0, 50
0, 0, 60
* BEGIN CODE:
data <- read.csv("patients.txt")
regress <- glm( second_attack ~ treatment + trait_anxiety, family=binomial("logit"), data = data)
library(sandwich)
vcovHC(regress, 'HC0')
* END CODE
*/
DROP TABLE IF EXISTS patients;
CREATE TABLE patients (
id INTEGER NOT NULL,
second_attack INTEGER,
treatment INTEGER,
trait_anxiety INTEGER
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
INSERT INTO patients(ID, second_attack, treatment, trait_anxiety) VALUES
( 1, 1, 1, 70),
( 2, 1, 1, 80),
( 3, 1, 1, 50),
( 4, 1, 0, 60),
( 5, 1, 0, 40),
( 6, 1, 0, 65),
( 7, 1, 0, 75),
( 8, 1, 0, 80),
( 9, 1, 0, 70),
(10, 1, 0, 60),
(11, 0, 1, 65),
(12, 0, 1, 50),
(13, 0, 1, 45),
(14, 0, 1, 35),
(15, 0, 1, 40),
(16, 0, 1, 50),
(17, 0, 0, 55),
(18, 0, 0, 45),
(19, 0, 0, 50),
(20, 0, 0, 60);
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]');
SELECT assert(
relative_error(coef, ARRAY[-6.36346, -1.024, 0.11904]) < 1e-3 AND
relative_error(std_err, ARRAY[3.4587,1.171,0.05343]) < 1e-3 AND
relative_error(z_stats, ARRAY[-1.8398,-0.87409,2.2279]) < 1e-3 AND
relative_error(p_values, ARRAY[0.06579,0.38206,0.025884]) < 1e-3,
'Robust Variance: Wrong Results'
)FROM temp_result;
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL );
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, 20 );
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, 20 , 'irls');
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, 20, 'irls', 0.0001 );
drop table if exists temp_result;
drop table if exists temp_result_summary;
select robust_variance_logregr('patients', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, 20, 'irls', 0.0001, FALSE );
-------------------------------------------------------------------------
DROP TABLE IF EXISTS test_hw_mlogr;
CREATE TABLE test_hw_mlogr(
feat1 INTEGER,
feat2 DOUBLE PRECISION,
cat INTEGER
);
INSERT INTO test_hw_mlogr(feat1, feat2, cat) VALUES
(1,0.35,1),
(2,0.33,0),
(3,0.39,1),
(1,0.37,1),
(2,0.31,1),
(3,0.36,0),
(2,0.36,1),
(2,0.31,1),
(2,0.41,1),
(2,0.37,1),
(1,0.44,1),
(3,0.33,2),
(1,0.31,1),
(2,0.44,1),
(1,0.35,1),
(1,0.44,0),
(1,0.46,0),
(2,0.46,1),
(2,0.46,2),
(3,0.49,1),
(2,0.39,0),
(2,0.44,1),
(1,0.47,1),
(1,0.44,1),
(1,0.37,2),
(3,0.38,2),
(1,0.49,0),
(2,0.44,0),
(1,0.41,2),
(1,0.5,2),
(2,0.44,0),
(1,0.39,1),
(1,0.4,2),
(1,0.46,2),
(2,0.41,1),
(2,0.39,1),
(2,0.33,1),
(3,0.59,2),
(1,0.41,0),
(2,0.47,2),
(2,0.31,0),
(3,0.42,2),
(1,0.55,2),
(3,0.4,1),
(1,0.44,2),
(1,0.54,1),
(2,0.46,1),
(1,0.54,0),
(2,0.42,1),
(2,0.49,2),
(2,0.41,2),
(2,0.41,1),
(1,0.44,0),
(1,0.57,2),
(2,0.52,2),
(1,0.49,0),
(3,0.41,2),
(3,0.57,0),
(1,0.62,1),
(3,0.33,0),
(2,0.54,1),
(2,0.4,2),
(3,0.52,2),
(2,0.57,1),
(2,0.49,1),
(2,0.46,1),
(1,0.57,0),
(2,0.49,2),
(2,0.52,2),
(2,0.53,0),
(3,0.54,2),
(2,0.57,2),
(3,0.41,2),
(1,0.52,0),
(2,0.57,1),
(1,0.54,0),
(2,0.52,1),
(2,0.52,0),
(2,0.44,0),
(2,0.46,2),
(1,0.49,1),
(2,0.54,2),
(3,0.52,2),
(1,0.44,0),
(3,0.49,1),
(1,0.46,2),
(2,0.54,0),
(2,0.39,0),
(2,0.59,0),
(2,0.45,1),
(3,0.52,1),
(3,0.54,0),
(3,0.44,1),
(2,0.5,2),
(2,0.62,1),
(2,0.59,0),
(2,0.52,2),
(2,0.52,1),
(2,0.46,1),
(2,0.41,0),
(2,0.52,2),
(2,0.52,1),
(2,0.55,1),
(2,0.41,1),
(2,0.49,0),
(1,0.59,2),
(1,0.54,0),
(2,0.54,0),
(2,0.59,2),
(2,0.55,2),
(1,0.62,2),
(2,0.54,2),
(2,0.54,2),
(2,0.54,2),
(2,0.59,2),
(2,0.57,1),
(3,0.61,2),
(3,0.52,2),
(2,0.59,2),
(2,0.62,2),
(1,0.6,1),
(2,0.59,2),
(2,0.65,2),
(3,0.61,2),
(2,0.59,2),
(3,0.59,2),
(2,0.59,2),
(2,0.59,2),
(2,0.65,2),
(3,0.57,2),
(2,0.59,2),
(3,0.49,2),
(1,0.49,0),
(3,0.59,2),
(2,0.62,2),
(3,0.59,0),
(2,0.54,2),
(3,0.63,2),
(1,0.43,2),
(3,0.54,2),
(3,0.52,2),
(1,0.57,2),
(2,0.57,0),
(2,0.57,0),
(2,0.61,2),
(2,0.62,0),
(2,0.62,0),
(1,0.65,0),
(2,0.57,2),
(3,0.59,2),
(2,0.59,2),
(3,0.62,2),
(2,0.65,2),
(2,0.62,1),
(1,0.62,0),
(2,0.62,2),
(3,0.54,2),
(3,0.62,2),
(1,0.65,2),
(3,0.62,2),
(3,0.67,0),
(3,0.65,0),
(1,0.6,2),
(3,0.59,2),
(2,0.59,2),
(2,0.59,1),
(3,0.65,0),
(3,0.62,2),
(3,0.65,2),
(3,0.59,0),
(1,0.59,0),
(3,0.61,2),
(1,0.65,2),
(3,0.67,1),
(3,0.65,2),
(1,0.65,2),
(2,0.67,2),
(1,0.65,2),
(1,0.62,2),
(3,0.52,2),
(3,0.63,2),
(2,0.59,2),
(3,0.65,2),
(2,0.59,0),
(3,0.67,2),
(3,0.67,2),
(3,0.6,2),
(3,0.67,2),
(3,0.62,2),
(2,0.54,2),
(3,0.65,2),
(3,0.62,2),
(2,0.59,2),
(3,0.6,2),
(3,0.63,2),
(3,0.65,2),
(2,0.63,1),
(2,0.67,2),
(2,0.65,2),
(2,0.62,2);
-- Checks for the function usage
select robust_variance_mlogregr('help');
select robust_variance_mlogregr('');
select robust_variance_mlogregr('random_test');
select robust_variance_mlogregr('usage');
DROP TABLE IF EXISTS test_hw_mlogr_result;
DROP TABLE IF EXISTS test_hw_mlogr_result_summary;
SELECT robust_variance_mlogregr(
'test_hw_mlogr', 'test_hw_mlogr_result',
'cat', 'ARRAY[1, feat1, feat2]', 0, NULL, 20, 'irls', 0.0001, False);
SELECT * FROM test_hw_mlogr_result;
SELECT assert(relative_error(
std_err, ARRAY[1.20, 0.29, 2.36]) < 1e-1,
'Robust Variance: Wrong Results')
from test_hw_mlogr_result
where category = 1;
SELECT assert(relative_error(
std_err, ARRAY[1.28, 0.28, 2.18]) < 1e-1,
'Robust Variance: Wrong Results')
from test_hw_mlogr_result
where category = 2;
DROP TABLE IF EXISTS test_hw_mlogr_result;
DROP TABLE IF EXISTS test_hw_mlogr_result_summary;
SELECT robust_variance_mlogregr(
'test_hw_mlogr', 'test_hw_mlogr_result',
'cat', 'ARRAY[1, feat1, feat2]', 0, NULL,
'max_iter=20, optimizer=irls, tolerance=0.0001',
False);
DROP TABLE IF EXISTS test_hw_mlogr_result;
DROP TABLE IF EXISTS test_hw_mlogr_result_summary;
SELECT robust_variance_mlogregr(
'test_hw_mlogr', 'test_hw_mlogr_result',
'cat', 'ARRAY[1, feat1, feat2]', 0, NULL,
'max_num_iterations=20, optimizer=irls, precision=0.0001',
False);
-- NULL handling test
DROP TABLE IF EXISTS weibull_with_null;
DROP TABLE IF EXISTS test_hw_mlogr_result_summary;
CREATE TABLE weibull_with_null AS SELECT * FROM weibull;
INSERT INTO weibull_with_null(id, x1, x2, y) VALUES
(18, NULL, 29.1, 251.3),
(19, 43.4, NULL, 251.3),
(20, 43.9, 29.5, NULL);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT robust_variance_linregr('weibull_with_null', 'temp_result', 'y',
'ARRAY[1, x1, x2]');
DROP TABLE IF EXISTS patients_with_null;
CREATE TABLE patients_with_null AS SELECT * FROM patients;
INSERT INTO patients_with_null(ID, second_attack, treatment, trait_anxiety) VALUES
(21, NULL, 1, 70),
(22, 1, NULL, 80),
(23, 1, 1, NULL);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT robust_variance_logregr('patients_with_null', 'temp_result', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]');
DROP TABLE IF EXISTS test_hw_mlogr_null_value;
CREATE TABLE test_hw_mlogr_null_value AS SELECT * FROM test_hw_mlogr;
INSERT INTO test_hw_mlogr(feat1, feat2, cat) VALUES
(NULL,0.67,2),
(2,NULL,2),
(2,0.62,NULL);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT robust_variance_mlogregr(
'test_hw_mlogr_null_value', 'temp_result',
'cat', 'ARRAY[1, feat1, feat2]', 0, NULL,
'max_iter=20, optimizer=irls, tolerance=0.0001', False);
SELECT assert(relative_error(
std_err, ARRAY[1.20, 0.29, 2.36]) < 1e-1,
'Robust Variance: Wrong Results')
from temp_result
where category = 1;
SELECT assert(relative_error(
std_err, ARRAY[1.28, 0.28, 2.18]) < 1e-1,
'Robust Variance: Wrong Results')
from temp_result
where category = 2;