blob: bf81866f9587a57de2c457cb1fbf5c73773b68e1 [file] [log] [blame]
m4_include(`SQLCommon.m4')
/* -----------------------------------------------------------------------------
* Test Linear Regression.
* -------------------------------------------------------------------------- */
/*
* The following example is taken from:
* http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1
* http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3
*/
DROP TABLE IF EXISTS weibull;
CREATE TABLE weibull (
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION
m4_ifdef(`__HAWQ__', `', `
, CONSTRAINT pk_weibull PRIMARY KEY (id)
')
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
/*
* We materialize the table here because on MPP systems we want to distribute
* the data to the segments (and not do the calculations in memory).
*/
INSERT INTO weibull(id, x1, x2, y) VALUES
( 1, 41.9, 29.1, 251.3),
( 2, 43.4, 29.3, 251.3),
( 3, 43.9, 29.5, 248.3),
( 4, 44.5, 29.7, 267.5),
( 5, 47.3, 29.9, 273.0),
( 6, 47.5, 30.3, 276.5),
( 7, 47.9, 30.5, 270.3),
( 8, 50.2, 30.7, 274.9),
( 9, 52.8, 30.8, 285.0),
(10, 53.2, 30.9, 290.0),
(11, 56.7, 31.5, 297.0),
(12, 57.0, 31.7, 302.5),
(13, 63.5, 31.9, 304.5),
(14, 65.3, 32.0, 309.3),
(15, 71.1, 32.1, 321.7),
(16, 77.0, 32.5, 330.7),
(17, 77.8, 32.9, 349.0),
-- Add NULL values
(18, NULL, 32.1, 321.7),
(19, 77.0, NULL, 330.7),
(20, 77.8, 32.9, NULL);
SELECT assert(
relative_error(coef, ARRAY[-153.51, 1.24, 12.08]) < 1e-4 AND
relative_error(t_stats[2], 3.1393) < 1e-4 AND
relative_error(t_stats[3], 3.0726) < 1e-4 AND
relative_error(p_values[3], 0.0083) < 1e-2,
'Linear regression (weibull.com test): Wrong results'
) FROM (
SELECT (linregr(y, ARRAY[1, x1, x2])).*
FROM weibull
) q;
/*
* The following example is taken from:
* http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/data_classification_qsar/linear_multilinear_regression.pdf
*/
DROP TABLE IF EXISTS unm;
CREATE TABLE unm (
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION
m4_ifdef(`__HAWQ__', `', `
, CONSTRAINT pk_unm PRIMARY KEY (id)
')
);
INSERT INTO unm(id, x1, x2, y) VALUES
(1, 0, 0.30, 10.14),
(2, 0.69, 0.60, 11.93),
(3, 1.10, 0.90, 13.57),
(4, 1.39, 1.20, 14.17),
(5, 1.61, 1.50, 15.25),
(6, 1.79, 1.80, 16.15),
-- Add NULL values
(7, NULL, 1.20, 14.17),
(8, 1.61, NULL, 15.25),
(9, 1.79, 1.80, NULL);
SELECT assert(
relative_error(coef, ARRAY[9.69, 2.09, 1.50]) < 1e-2,
'Linear regression (unm): Wrong coefficients'
) FROM (
SELECT (linregr(y, ARRAY[1, x1, x2])).*
FROM unm
) q;
/*
* The following example is taken from:
* http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf
*/
DROP TABLE IF EXISTS houses;
CREATE TABLE houses (
id SERIAL NOT NULL,
tax INTEGER,
bedroom REAL,
bath REAL,
price INTEGER,
size INTEGER,
lot INTEGER
m4_ifdef(`__HAWQ__', `', `
, CONSTRAINT pk_houses PRIMARY KEY (id)
')
);
INSERT INTO houses(tax, bedroom, bath, price, size, lot) VALUES
( 590, 2, 1, 50000, 770, 22100),
(1050, 3, 2, 85000, 1410, 12000),
( 20, 3, 1, 22500, 1060, 3500 ),
( 870, 2, 2, 90000, 1300, 17500),
(1320, 3, 2, 133000, 1500, 30000),
(1350, 2, 1, 90500, 820, 25700),
(2790, 3, 2.5, 260000, 2130, 25000),
( 680, 2, 1, 142500, 1170, 22000),
(1840, 3, 2, 160000, 1500, 19000),
(3680, 4, 2, 240000, 2790, 20000),
(1660, 3, 1, 87000, 1030, 17500),
(1620, 3, 2, 118600, 1250, 20000),
(3100, 3, 2, 140000, 1760, 38000),
(2070, 2, 3, 148000, 1550, 14000),
( 650, 3, 1.5, 65000, 1450, 12000);
-- Values computed with MADlib
SELECT assert(
relative_error(coef, ARRAY[27923.43, -35524.78, 2269.34, 130.79]) < 1e-4 AND
relative_error(r2, 0.74537) < 1e-3 AND
relative_error(std_err, ARRAY[56306, 25037, 22208, 36.209]) < 1e-4 AND
relative_error(t_stats, ARRAY[0.49592, -1.4189, 0.10218, 3.6122]) < 1e-4 AND
relative_error(p_values, ARRAY[0.62971, 0.18363, 0.92045, 0.0040816]) < 1e-4 AND
relative_error(condition_no, 9783.0184) < 1e-4,
'Linear regression (houses): Wrong results'
) FROM (
SELECT (linregr(price, array[1, bedroom, bath, size])).*
FROM houses
) q;
-- Merge functions must prodice valid results when being called with the
-- aggregate's initial state
SELECT assert(
(linregr).r2 = 1,
'Linear regression failed on singleton data set.'
) FROM (
SELECT linregr_final(
linregr_merge_states(
linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2]),
CAST('' AS bytea8)
)
) AS linregr
) ignored;
SELECT assert(
(linregr).r2 = 1,
'Linear regression failed on singleton data set.'
) FROM (
SELECT linregr_final(
linregr_merge_states(
CAST('' AS bytea8),
linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2])
)
) AS linregr
) ignored;
------------------------------------------------------------------------
drop table if exists result_lin_houses;
drop table if exists result_lin_houses_summary;
select linregr_train('houses', 'result_lin_houses', 'price',
'array[1, tax, bath, size]',
'bedroom', True);
select assert(
relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and
relative_error(r2, 0.9688) < 1e-2 and
relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
relative_error(condition_no, 10086.1) < 1e-2 and
relative_error(bp_stats, 2.5451) < 1e-2 and
relative_error(bp_p_value, 0.467192) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses
where bedroom = 2;
select assert(
relative_error(coef, array[-88155.8292502747,27.1966436293179,41404.0293389239,62.6375210724027]) < 1e-2 and
relative_error(r2, 0.841699901312963) < 1e-2 and
relative_error(std_err, array[57867.9999699512,17.82723091538,43643.1321521931,70.8506824870639]) < 1e-2 and
relative_error(t_stats, array[-1.52339512850022,1.52556747362568,0.948695185179172,0.884077878626493]) < 1e-2 and
relative_error(p_values, array[0.18816143289241,0.187636685729725,0.38634003235866,0.417132778730133]) < 1e-2 and
relative_error(condition_no, 11722.62) < 1e-2 and
relative_error(bp_stats, 6.753838) < 1e-2 and
relative_error(bp_p_value, 0.08017) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses
where bedroom = 3;
SELECT id,
linregr_predict(coef, ARRAY[1, tax, bath, size]) as pred_value
FROM houses h, result_lin_houses m
ORDER BY id;
drop table if exists result_lin_houses cascade;
select linregr_train();
select linregr_train('usage');
select linregr_train('example');
select linregr_predict();
select linregr_predict('usage');
select linregr_predict('example');