| m4_include(`SQLCommon.m4') |
| |
| /* ----------------------------------------------------------------------------- |
| * Test Linear Regression. |
| * -------------------------------------------------------------------------- */ |
| |
| /* |
| * The following example is taken from: |
| * http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1 |
| * http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3 |
| */ |
| DROP TABLE IF EXISTS weibull; |
| CREATE TABLE weibull ( |
| id INTEGER NOT NULL, |
| x1 DOUBLE PRECISION, |
| x2 DOUBLE PRECISION, |
| y DOUBLE PRECISION |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| /* |
| * We materialize the table here because on MPP systems we want to distribute |
| * the data to the segments (and not do the calculations in memory). |
| */ |
| INSERT INTO weibull(id, x1, x2, y) VALUES |
| ( 1, 41.9, 29.1, 251.3), |
| ( 2, 43.4, 29.3, 251.3), |
| ( 3, 43.9, 29.5, 248.3), |
| ( 4, 44.5, 29.7, 267.5), |
| ( 5, 47.3, 29.9, 273.0), |
| ( 6, 47.5, 30.3, 276.5), |
| ( 7, 47.9, 30.5, 270.3), |
| ( 8, 50.2, 30.7, 274.9), |
| ( 9, 52.8, 30.8, 285.0), |
| (10, 53.2, 30.9, 290.0), |
| (11, 56.7, 31.5, 297.0), |
| (12, 57.0, 31.7, 302.5), |
| (13, 63.5, 31.9, 304.5), |
| (14, 65.3, 32.0, 309.3), |
| (15, 71.1, 32.1, 321.7), |
| (16, 77.0, 32.5, 330.7), |
| (17, 77.8, 32.9, 349.0), |
| -- Add NULL values |
| (18, NULL, 32.1, 321.7), |
| (19, 77.0, NULL, 330.7), |
| (20, 77.8, 32.9, NULL); |
| |
| SELECT assert( |
| relative_error(coef, ARRAY[-153.51, 1.24, 12.08]) < 1e-4 AND |
| relative_error(t_stats[2], 3.1393) < 1e-4 AND |
| relative_error(t_stats[3], 3.0726) < 1e-4 AND |
| relative_error(p_values[3], 0.0083) < 1e-2, |
| 'Linear regression (weibull.com test): Wrong results' |
| ) FROM ( |
| SELECT (linregr(y, ARRAY[1, x1, x2])).* |
| FROM weibull |
| ) q; |
| |
| |
| |
| /* |
| * The following example is taken from: |
| * http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/data_classification_qsar/linear_multilinear_regression.pdf |
| */ |
| DROP TABLE IF EXISTS unm; |
| CREATE TABLE unm ( |
| id INTEGER NOT NULL, |
| x1 DOUBLE PRECISION, |
| x2 DOUBLE PRECISION, |
| y DOUBLE PRECISION |
| ); |
| |
| INSERT INTO unm(id, x1, x2, y) VALUES |
| (1, 0, 0.30, 10.14), |
| (2, 0.69, 0.60, 11.93), |
| (3, 1.10, 0.90, 13.57), |
| (4, 1.39, 1.20, 14.17), |
| (5, 1.61, 1.50, 15.25), |
| (6, 1.79, 1.80, 16.15), |
| -- Add NULL values |
| (7, NULL, 1.20, 14.17), |
| (8, 1.61, NULL, 15.25), |
| (9, 1.79, 1.80, NULL); |
| |
| SELECT assert( |
| relative_error(coef, ARRAY[9.69, 2.09, 1.50]) < 1e-2, |
| 'Linear regression (unm): Wrong coefficients' |
| ) FROM ( |
| SELECT (linregr(y, ARRAY[1, x1, x2])).* |
| FROM unm |
| ) q; |
| |
| |
| /* |
| * The following example is taken from: |
| * http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf |
| */ |
| |
| DROP TABLE IF EXISTS houses; |
| CREATE TABLE houses ( |
| id SERIAL NOT NULL, |
| tax INTEGER, |
| bedroom REAL, |
| bath REAL, |
| price INTEGER, |
| size INTEGER, |
| lot INTEGER |
| ); |
| |
| INSERT INTO houses(tax, bedroom, bath, price, size, lot) VALUES |
| ( 590, 2, 1, 50000, 770, 22100), |
| (1050, 3, 2, 85000, 1410, 12000), |
| ( 20, 3, 1, 22500, 1060, 3500 ), |
| ( 870, 2, 2, 90000, 1300, 17500), |
| (1320, 3, 2, 133000, 1500, 30000), |
| (1350, 2, 1, 90500, 820, 25700), |
| (2790, 3, 2.5, 260000, 2130, 25000), |
| ( 680, 2, 1, 142500, 1170, 22000), |
| (1840, 3, 2, 160000, 1500, 19000), |
| (3680, 4, 2, 240000, 2790, 20000), |
| (1660, 3, 1, 87000, 1030, 17500), |
| (1620, 3, 2, 118600, 1250, 20000), |
| (3100, 3, 2, 140000, 1760, 38000), |
| (2070, 2, 3, 148000, 1550, 14000), |
| ( 650, 3, 1.5, 65000, 1450, 12000); |
| |
| -- Values computed with MADlib |
| SELECT assert( |
| relative_error(coef, ARRAY[27923.43, -35524.78, 2269.34, 130.79]) < 1e-4 AND |
| relative_error(r2, 0.74537) < 1e-3 AND |
| relative_error(std_err, ARRAY[56306, 25037, 22208, 36.209]) < 1e-4 AND |
| relative_error(t_stats, ARRAY[0.49592, -1.4189, 0.10218, 3.6122]) < 1e-4 AND |
| relative_error(p_values, ARRAY[0.62971, 0.18363, 0.92045, 0.0040816]) < 1e-4 AND |
| relative_error(condition_no, 9783.0184) < 1e-4, |
| 'Linear regression (houses): Wrong results' |
| ) FROM ( |
| SELECT (linregr(price, array[1, bedroom, bath, size])).* |
| FROM houses |
| ) q; |
| |
| -- Merge functions must prodice valid results when being called with the |
| -- aggregate's initial state |
| SELECT assert( |
| (linregr).r2 = 1, |
| 'Linear regression failed on singleton data set.' |
| ) FROM ( |
| SELECT linregr_final( |
| linregr_merge_states( |
| linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2]), |
| CAST('' AS bytea8) |
| ) |
| ) AS linregr |
| ) ignored; |
| |
| SELECT assert( |
| (linregr).r2 = 1, |
| 'Linear regression failed on singleton data set.' |
| ) FROM ( |
| SELECT linregr_final( |
| linregr_merge_states( |
| CAST('' AS bytea8), |
| linregr_transition(CAST('' AS bytea8), 3, ARRAY[5,2]) |
| ) |
| ) AS linregr |
| ) ignored; |
| |
| ------------------------------------------------------------------------ |
| |
| drop table if exists result_lin_houses; |
| drop table if exists result_lin_houses_summary; |
| select linregr_train('houses', 'result_lin_houses', 'price', |
| 'array[1, tax, bath, size]', |
| 'bedroom', True); |
| |
| select assert( |
| relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and |
| relative_error(r2, 0.9688) < 1e-2 and |
| relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and |
| relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and |
| relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and |
| relative_error(condition_no, 10086.1) < 1e-2 and |
| relative_error(bp_stats, 2.5451) < 1e-2 and |
| relative_error(bp_p_value, 0.467192) < 1e-2, |
| 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' |
| ) |
| from result_lin_houses |
| where bedroom = 2; |
| |
| select assert( |
| relative_error(coef, array[-88155.8292502747,27.1966436293179,41404.0293389239,62.6375210724027]) < 1e-2 and |
| relative_error(r2, 0.841699901312963) < 1e-2 and |
| relative_error(std_err, array[57867.9999699512,17.82723091538,43643.1321521931,70.8506824870639]) < 1e-2 and |
| relative_error(t_stats, array[-1.52339512850022,1.52556747362568,0.948695185179172,0.884077878626493]) < 1e-2 and |
| relative_error(p_values, array[0.18816143289241,0.187636685729725,0.38634003235866,0.417132778730133]) < 1e-2 and |
| relative_error(condition_no, 11722.62) < 1e-2 and |
| relative_error(bp_stats, 6.753838) < 1e-2 and |
| relative_error(bp_p_value, 0.08017) < 1e-2, |
| 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' |
| ) |
| from result_lin_houses |
| where bedroom = 3; |
| |
| SELECT id, |
| linregr_predict(coef, ARRAY[1, tax, bath, size]) as pred_value |
| FROM houses h, result_lin_houses m |
| ORDER BY id; |
| |
| drop table if exists result_lin_houses cascade; |
| |
| select linregr_train(); |
| select linregr_train('usage'); |
| select linregr_train('example'); |
| |
| select linregr_predict(); |
| select linregr_predict('usage'); |
| select linregr_predict('example'); |
| |
| |
| |
| |
| |
| ------------------------------------------------------------------------ |
| |
| |
| -- Test case for JSON Data Type and Special characters. |
| -- This function checks special characters for GPDB version = 4.x and JSON for 5.x and above |
| |
| create or replace function linereg_expr_test() RETURNS VOID AS |
| $$ |
| DECLARE col_type TEXT; |
| |
| begin |
| select typname into col_type from pg_type where typname = 'json' ; |
| if col_type = 'json' THEN |
| |
| DROP TABLE IF EXISTS houses_json; |
| CREATE TABLE houses_json ( |
| id SERIAL NOT NULL, |
| data json |
| ); |
| |
| |
| INSERT INTO houses_json VALUES |
| ( 2 ,'{ "ta,x": 590, "bedroom":2, "ba$th":1, "pr''ice": 50000, "size": 770, "lot":22100 }'), |
| ( 4 ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2, "pr''ice": 85000, "size":1410, "lot":12000 }'), |
| ( 1 ,'{ "ta,x": 20, "bedroom":3, "ba$th":1, "pr''ice": 22500, "size":1060, "lot":3500 }'), |
| ( 6 ,'{ "ta,x": 870, "bedroom":2, "ba$th":2, "pr''ice": 90000, "size":1300, "lot":17500 }'), |
| ( 3 ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2, "pr''ice": 133000, "size":1500, "lot":30000 }'), |
| ( 5 ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1, "pr''ice": 90500, "size": 820, "lot":25700 }'), |
| ( 7 ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, "size":2130, "lot":25000 }'), |
| ( 10 ,'{ "ta,x": 680, "bedroom":2, "ba$th":1, "pr''ice": 142500, "size":1170, "lot":22000 }'), |
| ( 9 ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2, "pr''ice": 160000, "size":1500, "lot":19000 }'), |
| ( 8 ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2, "pr''ice": 240000, "size":2790, "lot":20000 }'), |
| ( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1, "pr''ice": 87000, "size":1030, "lot":17500 }'), |
| ( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2, "pr''ice": 118600, "size":1250, "lot":20000 }'), |
| ( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2, "pr''ice": 140000, "size":1760, "lot":38000 }'), |
| ( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3, "pr''ice": 148000, "size":1550, "lot":14000 }'), |
| ( 15 ,'{ "ta,x": 650, "bedroom":3, "ba$th":1.5, "pr''ice": 65000, "size":1450, "lot":12000 }'); |
| |
| drop table if exists result_lin_houses_json; |
| drop table if exists result_lin_houses_json_summary; |
| PERFORM linregr_train('houses_json', 'result_lin_houses_json', '(data->>''pr''''ice'')::integer', |
| 'array[1, (data->>''ta,x'')::integer, (data->>''ba$th'')::double precision, (data->>''size'')::integer]', |
| 'data->>''bedroom''', True); |
| PERFORM assert( |
| relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and |
| relative_error(r2, 0.9688) < 1e-2 and |
| relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and |
| relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and |
| relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and |
| relative_error(condition_no, 10086.1) < 1e-2 and |
| relative_error(bp_stats, 2.5451) < 1e-2 and |
| relative_error(bp_p_value, 0.467192) < 1e-2, |
| 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' |
| ) |
| from result_lin_houses_json |
| where "data->>'bedroom'"::integer = 2; |
| |
| else |
| |
| |
| |
| DROP TABLE IF EXISTS houses_spcl; |
| CREATE TABLE houses_spcl ( |
| id SERIAL NOT NULL, |
| "ta,x" INTEGER, |
| "bed,room" REAL, |
| "ba$th" REAL, |
| "pr'ice" INTEGER, |
| size INTEGER, |
| lot INTEGER |
| ); |
| |
| INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) VALUES |
| ( 590, 2, 1, 50000, 770, 22100), |
| (1050, 3, 2, 85000, 1410, 12000), |
| ( 20, 3, 1, 22500, 1060, 3500 ), |
| ( 870, 2, 2, 90000, 1300, 17500), |
| (1320, 3, 2, 133000, 1500, 30000), |
| (1350, 2, 1, 90500, 820, 25700), |
| (2790, 3, 2.5, 260000, 2130, 25000), |
| ( 680, 2, 1, 142500, 1170, 22000), |
| (1840, 3, 2, 160000, 1500, 19000), |
| (3680, 4, 2, 240000, 2790, 20000), |
| (1660, 3, 1, 87000, 1030, 17500), |
| (1620, 3, 2, 118600, 1250, 20000), |
| (3100, 3, 2, 140000, 1760, 38000), |
| (2070, 2, 3, 148000, 1550, 14000), |
| ( 650, 3, 1.5, 65000, 1450, 12000); |
| |
| drop table if exists result_lin_houses_spcl; |
| drop table if exists result_lin_houses_spcl_summary; |
| PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"', |
| 'array[1, "ta,x", "ba$th", size]', |
| '"bed,room"', True); |
| PERFORM assert( |
| relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and |
| relative_error(r2, 0.9688) < 1e-2 and |
| relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and |
| relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and |
| relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and |
| relative_error(condition_no, 10086.1) < 1e-2 and |
| relative_error(bp_stats, 2.5451) < 1e-2 and |
| relative_error(bp_p_value, 0.467192) < 1e-2, |
| 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' |
| ) |
| from result_lin_houses_spcl |
| where result_lin_houses_spcl."bed,room" = 2; |
| |
| |
| end IF; |
| |
| end; |
| $$ LANGUAGE plpgsql; |
| |
| |
| select linereg_expr_test(); |
| |
| |
| |
| ------------------------------------------------------------------------ |
| |
| -- Test Case for : when grouping_cols is NULL and the heteroskedasticity_option is True |
| |
| |
| |
| |
| drop table if exists result_lin_houses; |
| drop table if exists result_lin_houses_summary; |
| select linregr_train('houses', 'result_lin_houses', 'price', |
| 'array[1, tax, bath, size]', |
| NULL, True); |
| |
| select assert( |
| relative_error(coef, array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 1e-2 and |
| relative_error(r2, 0.768577580597443) < 1e-2 and |
| relative_error(std_err, array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 1e-2 and |
| relative_error(t_stats, array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 1e-2 and |
| relative_error(p_values, array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186]) < 1e-2 and |
| relative_error(condition_no, 9002.50) < 1e-2 and |
| relative_error(bp_stats, 1.22605243985138) < 1e-2 and |
| relative_error(bp_p_value, 0.746762880478034) < 1e-2, |
| 'Linear regression with heteroskedasticity with no grouping: Wrong results' |
| ) |
| from result_lin_houses; |
| |