blob: 612914a2355ad90e639cb671c9591eb9dc965b1b [file] [log] [blame]
m4_include(`SQLCommon.m4')
/* -----------------------------------------------------------------------------
* Test Linear Regression.
* -------------------------------------------------------------------------- */
/*
* The following example is taken from:
* http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1
* http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3
*/
DROP TABLE IF EXISTS weibull;
CREATE TABLE weibull (
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
/*
* We materialize the table here because on MPP systems we want to distribute
* the data to the segments (and not do the calculations in memory).
*/
INSERT INTO weibull(id, x1, x2, y) VALUES
( 1, 41.9, 29.1, 251.3),
( 2, 43.4, 29.3, 251.3),
( 3, 43.9, 29.5, 248.3),
( 4, 44.5, 29.7, 267.5),
( 5, 47.3, 29.9, 273.0),
( 6, 47.5, 30.3, 276.5),
( 7, 47.9, 30.5, 270.3),
( 8, 50.2, 30.7, 274.9),
( 9, 52.8, 30.8, 285.0),
(10, 53.2, 30.9, 290.0),
(11, 56.7, 31.5, 297.0),
(12, 57.0, 31.7, 302.5),
(13, 63.5, 31.9, 304.5),
(14, 65.3, 32.0, 309.3),
(15, 71.1, 32.1, 321.7),
(16, 77.0, 32.5, 330.7),
(17, 77.8, 32.9, 349.0),
-- Add NULL values
(18, NULL, 32.1, 321.7),
(19, 77.0, NULL, 330.7),
(20, 77.8, 32.9, NULL);
SELECT assert(
relative_error(coef, ARRAY[-153.51, 1.24, 12.08]) < 1e-4 AND
relative_error(t_stats[2], 3.1393) < 1e-4 AND
relative_error(t_stats[3], 3.0726) < 1e-4 AND
relative_error(p_values[3], 0.0083) < 1e-2,
'Linear regression (weibull.com test): Wrong results'
) FROM (
SELECT (linregr(y, ARRAY[1, x1, x2])).*
FROM weibull
) q;
/*
* The following example is taken from:
* http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/data_classification_qsar/linear_multilinear_regression.pdf
*/
DROP TABLE IF EXISTS unm;
CREATE TABLE unm (
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION
);
INSERT INTO unm(id, x1, x2, y) VALUES
(1, 0, 0.30, 10.14),
(2, 0.69, 0.60, 11.93),
(3, 1.10, 0.90, 13.57),
(4, 1.39, 1.20, 14.17),
(5, 1.61, 1.50, 15.25),
(6, 1.79, 1.80, 16.15),
-- Add NULL values
(7, NULL, 1.20, 14.17),
(8, 1.61, NULL, 15.25),
(9, 1.79, 1.80, NULL);
SELECT assert(
relative_error(coef, ARRAY[9.69, 2.09, 1.50]) < 1e-2,
'Linear regression (unm): Wrong coefficients'
) FROM (
SELECT (linregr(y, ARRAY[1, x1, x2])).*
FROM unm
) q;
/*
* The following example is taken from:
* http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf
*/
DROP TABLE IF EXISTS houses;
CREATE TABLE houses (
id SERIAL NOT NULL,
tax INTEGER,
bedroom REAL,
bath REAL,
price INTEGER,
size INTEGER,
lot INTEGER
);
INSERT INTO houses(tax, bedroom, bath, price, size, lot) VALUES
( 590, 2, 1, 50000, 770, 22100),
(1050, 3, 2, 85000, 1410, 12000),
( 20, 3, 1, 22500, 1060, 3500 ),
( 870, 2, 2, 90000, 1300, 17500),
(1320, 3, 2, 133000, 1500, 30000),
(1350, 2, 1, 90500, 820, 25700),
(2790, 3, 2.5, 260000, 2130, 25000),
( 680, 2, 1, 142500, 1170, 22000),
(1840, 3, 2, 160000, 1500, 19000),
(3680, 4, 2, 240000, 2790, 20000),
(1660, 3, 1, 87000, 1030, 17500),
(1620, 3, 2, 118600, 1250, 20000),
(3100, 3, 2, 140000, 1760, 38000),
(2070, 2, 3, 148000, 1550, 14000),
( 650, 3, 1.5, 65000, 1450, 12000);
-- Values computed with MADlib
SELECT assert(
relative_error(coef, ARRAY[27923.43, -35524.78, 2269.34, 130.79]) < 1e-4 AND
relative_error(r2, 0.74537) < 1e-3 AND
relative_error(std_err, ARRAY[56306, 25037, 22208, 36.209]) < 1e-4 AND
relative_error(t_stats, ARRAY[0.49592, -1.4189, 0.10218, 3.6122]) < 1e-4 AND
relative_error(p_values, ARRAY[0.62971, 0.18363, 0.92045, 0.0040816]) < 1e-4 AND
relative_error(condition_no, 9783.0184) < 1e-4,
'Linear regression (houses): Wrong results'
) FROM (
SELECT (linregr(price, array[1, bedroom, bath, size])).*
FROM houses
) q;
-- Merge functions must prodice valid results when being called with the
-- aggregate's initial state
SELECT assert(
(linregr).r2 = 1,
'Linear regression failed on singleton data set.'
) FROM (
SELECT linregr_final(
linregr_merge_states(
linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2]),
CAST('' AS bytea8)
)
) AS linregr
) ignored;
SELECT assert(
(linregr).r2 = 1,
'Linear regression failed on singleton data set.'
) FROM (
SELECT linregr_final(
linregr_merge_states(
CAST('' AS bytea8),
linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2])
)
) AS linregr
) ignored;
------------------------------------------------------------------------
drop table if exists result_lin_houses;
drop table if exists result_lin_houses_summary;
select linregr_train('houses', 'result_lin_houses', 'price',
'array[1, tax, bath, size]',
'bedroom', True);
select assert(
relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and
relative_error(r2, 0.9688) < 1e-2 and
relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
relative_error(condition_no, 10086.1) < 1e-2 and
relative_error(bp_stats, 2.5451) < 1e-2 and
relative_error(bp_p_value, 0.467192) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses
where bedroom = 2;
select assert(
relative_error(coef, array[-88155.8292502747,27.1966436293179,41404.0293389239,62.6375210724027]) < 1e-2 and
relative_error(r2, 0.841699901312963) < 1e-2 and
relative_error(std_err, array[57867.9999699512,17.82723091538,43643.1321521931,70.8506824870639]) < 1e-2 and
relative_error(t_stats, array[-1.52339512850022,1.52556747362568,0.948695185179172,0.884077878626493]) < 1e-2 and
relative_error(p_values, array[0.18816143289241,0.187636685729725,0.38634003235866,0.417132778730133]) < 1e-2 and
relative_error(condition_no, 11722.62) < 1e-2 and
relative_error(bp_stats, 6.753838) < 1e-2 and
relative_error(bp_p_value, 0.08017) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses
where bedroom = 3;
SELECT id,
linregr_predict(coef, ARRAY[1, tax, bath, size]) as pred_value
FROM houses h, result_lin_houses m
ORDER BY id;
drop table if exists result_lin_houses cascade;
select linregr_train();
select linregr_train('usage');
select linregr_train('example');
select linregr_predict();
select linregr_predict('usage');
select linregr_predict('example');
------------------------------------------------------------------------
-- Test case for JSON Data Type and Special characters.
-- This function checks special characters for GPDB version = 4.x and JSON for 5.x and above
create or replace function linereg_expr_test() RETURNS VOID AS
$$
DECLARE col_type TEXT;
begin
select typname into col_type from pg_type where typname = 'json' ;
if col_type = 'json' THEN
DROP TABLE IF EXISTS houses_json;
CREATE TABLE houses_json (
id SERIAL NOT NULL,
data json
);
INSERT INTO houses_json VALUES
( 2 ,'{ "ta,x": 590, "bedroom":2, "ba$th":1, "pr''ice": 50000, "size": 770, "lot":22100 }'),
( 4 ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2, "pr''ice": 85000, "size":1410, "lot":12000 }'),
( 1 ,'{ "ta,x": 20, "bedroom":3, "ba$th":1, "pr''ice": 22500, "size":1060, "lot":3500 }'),
( 6 ,'{ "ta,x": 870, "bedroom":2, "ba$th":2, "pr''ice": 90000, "size":1300, "lot":17500 }'),
( 3 ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2, "pr''ice": 133000, "size":1500, "lot":30000 }'),
( 5 ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1, "pr''ice": 90500, "size": 820, "lot":25700 }'),
( 7 ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, "size":2130, "lot":25000 }'),
( 10 ,'{ "ta,x": 680, "bedroom":2, "ba$th":1, "pr''ice": 142500, "size":1170, "lot":22000 }'),
( 9 ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2, "pr''ice": 160000, "size":1500, "lot":19000 }'),
( 8 ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2, "pr''ice": 240000, "size":2790, "lot":20000 }'),
( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1, "pr''ice": 87000, "size":1030, "lot":17500 }'),
( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2, "pr''ice": 118600, "size":1250, "lot":20000 }'),
( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2, "pr''ice": 140000, "size":1760, "lot":38000 }'),
( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3, "pr''ice": 148000, "size":1550, "lot":14000 }'),
( 15 ,'{ "ta,x": 650, "bedroom":3, "ba$th":1.5, "pr''ice": 65000, "size":1450, "lot":12000 }');
drop table if exists result_lin_houses_json;
drop table if exists result_lin_houses_json_summary;
PERFORM linregr_train('houses_json', 'result_lin_houses_json', '(data->>''pr''''ice'')::integer',
'array[1, (data->>''ta,x'')::integer, (data->>''ba$th'')::double precision, (data->>''size'')::integer]',
'data->>''bedroom''', True);
PERFORM assert(
relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and
relative_error(r2, 0.9688) < 1e-2 and
relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
relative_error(condition_no, 10086.1) < 1e-2 and
relative_error(bp_stats, 2.5451) < 1e-2 and
relative_error(bp_p_value, 0.467192) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses_json
where "data->>'bedroom'"::integer = 2;
else
DROP TABLE IF EXISTS houses_spcl;
CREATE TABLE houses_spcl (
id SERIAL NOT NULL,
"ta,x" INTEGER,
"bed,room" REAL,
"ba$th" REAL,
"pr'ice" INTEGER,
size INTEGER,
lot INTEGER
);
INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) VALUES
( 590, 2, 1, 50000, 770, 22100),
(1050, 3, 2, 85000, 1410, 12000),
( 20, 3, 1, 22500, 1060, 3500 ),
( 870, 2, 2, 90000, 1300, 17500),
(1320, 3, 2, 133000, 1500, 30000),
(1350, 2, 1, 90500, 820, 25700),
(2790, 3, 2.5, 260000, 2130, 25000),
( 680, 2, 1, 142500, 1170, 22000),
(1840, 3, 2, 160000, 1500, 19000),
(3680, 4, 2, 240000, 2790, 20000),
(1660, 3, 1, 87000, 1030, 17500),
(1620, 3, 2, 118600, 1250, 20000),
(3100, 3, 2, 140000, 1760, 38000),
(2070, 2, 3, 148000, 1550, 14000),
( 650, 3, 1.5, 65000, 1450, 12000);
drop table if exists result_lin_houses_spcl;
drop table if exists result_lin_houses_spcl_summary;
PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"',
'array[1, "ta,x", "ba$th", size]',
'"bed,room"', True);
PERFORM assert(
relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and
relative_error(r2, 0.9688) < 1e-2 and
relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
relative_error(condition_no, 10086.1) < 1e-2 and
relative_error(bp_stats, 2.5451) < 1e-2 and
relative_error(bp_p_value, 0.467192) < 1e-2,
'Linear regression with heteroskedasticity & grouping (houses): Wrong results'
)
from result_lin_houses_spcl
where result_lin_houses_spcl."bed,room" = 2;
end IF;
end;
$$ LANGUAGE plpgsql;
select linereg_expr_test();
------------------------------------------------------------------------
-- Test Case for : when grouping_cols is NULL and the heteroskedasticity_option is True
drop table if exists result_lin_houses;
drop table if exists result_lin_houses_summary;
select linregr_train('houses', 'result_lin_houses', 'price',
'array[1, tax, bath, size]',
NULL, True);
select assert(
relative_error(coef, array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 1e-2 and
relative_error(r2, 0.768577580597443) < 1e-2 and
relative_error(std_err, array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 1e-2 and
relative_error(t_stats, array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 1e-2 and
relative_error(p_values, array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186]) < 1e-2 and
relative_error(condition_no, 9002.50) < 1e-2 and
relative_error(bp_stats, 1.22605243985138) < 1e-2 and
relative_error(bp_p_value, 0.746762880478034) < 1e-2,
'Linear regression with heteroskedasticity with no grouping: Wrong results'
)
from result_lin_houses;