blob: 6806c14403d3584645ef69c0f039dd27c6154ddc [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Setup:
---------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION install_test() RETURNS VOID AS $$
declare
lr MADLIB_SCHEMA.linregr_result;
n FLOAT;
xsq FLOAT;
acoef FLOAT[];
arsq FLOAT;
atst1 FLOAT;
atst2 FLOAT;
atst FLOAT[];
result INT;
result2 INT;
begin
-- DROP TABLE IF EXISTS data;
CREATE TABLE data(x1 numeric,x2 float,noise numeric);
INSERT INTO data(x1,x2,noise) SELECT (random()-0.5),(random()-0.5),(random()-0.5) FROM generate_series(1,100) as a;
-- DROP TABLE IF EXISTS data2;
CREATE TABLE data2(r1 float, r2 float, r3 float, y numeric);
INSERT INTO data2(r1,r2,r3,y) SELECT x1,x2,noise,(x1*3.4+x2*2.5+noise) FROM data;
lr := (SELECT MADLIB_SCHEMA.linregr(y,array[r1,1]) FROM data2);
--check univariate results
SELECT count(y) INTO n FROM data2;
SELECT sum(r1*r1)::float INTO xsq FROM data2;
SELECT ARRAY[ ((count(y) * sum(r1*y) - sum(r1)*sum(y))/(count(y) * sum(r1*r1) - sum(r1)^2)),
((sum(y)*sum(r1*r1) - sum(r1)*sum(r1*y))/(count(y) * sum(r1*r1) - sum(r1)^2))]::float[] INTO acoef FROM data2;
SELECT ((count(y) * sum(r1*y) - sum(r1)*sum(y))^2/
((count(y) * sum(r1*r1) - sum(r1)^2) * (count(y) * sum(y*y) - sum(y)^2)))::float INTO arsq FROM data2;
SELECT |/ ( (count(y)-2) / ( 1/
( ((count(y) * sum(r1*y) - sum(r1)*sum(y))^2/
((count(y) * sum(r1*r1) - sum(r1)^2) * (count(y) * sum(y*y) - sum(y)^2))) ) - 1) ) ::float INTO atst1 FROM data2;
atst = array[ atst1, acoef[2]/( (acoef[1]/atst1)*sqrt(xsq/n)) ];
result = (MADLIB_SCHEMA.array_sum( MADLIB_SCHEMA.array_sub( MADLIB_SCHEMA.array_scalar_mult(lr.coef,1000::FLOAT) , MADLIB_SCHEMA.array_scalar_mult(acoef,1000::FLOAT) ) ))::INT
+((lr.r2*1000) - (arsq*1000))::INT
+(MADLIB_SCHEMA.array_sum( MADLIB_SCHEMA.array_sub( MADLIB_SCHEMA.array_scalar_mult(lr.t_stats,1000::FLOAT) , MADLIB_SCHEMA.array_scalar_mult(atst,1000::FLOAT) ) ))::INT;
IF (result != 0) THEN
RAISE EXCEPTION 'Incorrect univariate results';
END IF;
--check multivariate results
lr := (SELECT MADLIB_SCHEMA.linregr(y,array[r1,r2,1]) FROM data2);
result = abs(lr.coef[1]*10 - 34)::INT;
result2 = abs(lr.coef[2]*10 - 25)::INT;
IF (result > 5) OR (result2 > 5) OR (lr.r2 < 0.9) THEN
RAISE EXCEPTION 'Incorrect multivariate results, got %,%',lr.coef,lr.r2;
END IF;
RAISE INFO 'Linear regression install checks passed';
RETURN;
end
$$ language plpgsql;
SELECT install_test();
-- The following example is taken from:
-- http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1
-- http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3
-- No notice message while creating table (creating implicit index).
-- SET client_min_messages = error;
CREATE TABLE weibull
(
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION,
CONSTRAINT pk_weibull PRIMARY KEY (id)
);
-- RESET client_min_messages;
COPY weibull (id, x1, x2, y) FROM stdin;
1 41.9 29.1 251.3
2 43.4 29.3 251.3
3 43.9 29.5 248.3
4 44.5 29.7 267.5
5 47.3 29.9 273.0
6 47.5 30.3 276.5
7 47.9 30.5 270.3
8 50.2 30.7 274.9
9 52.8 30.8 285.0
10 53.2 30.9 290.0
11 56.7 31.5 297.0
12 57.0 31.7 302.5
13 63.5 31.9 304.5
14 65.3 32.0 309.3
15 71.1 32.1 321.7
16 77.0 32.5 330.7
17 77.8 32.9 349.0
\.
-- The following example is taken from:
-- http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/
-- data_classification_qsar/linear_multilinear_regression.pdf
-- SET client_min_messages = error;
CREATE TABLE unm
(
id INTEGER NOT NULL,
x1 DOUBLE PRECISION,
x2 DOUBLE PRECISION,
y DOUBLE PRECISION,
CONSTRAINT pk_unm PRIMARY KEY (id)
);
-- RESET client_min_messages;
COPY unm (id, x1, x2, y) FROM stdin;
1 0 0.30 10.14
2 0.69 0.60 11.93
3 1.10 0.90 13.57
4 1.39 1.20 14.17
5 1.61 1.50 15.25
6 1.79 1.80 16.15
\.
-- The following example is taken from:
-- http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf
-- SET client_min_messages = error;
CREATE TABLE houses
(
id SERIAL NOT NULL,
tax INTEGER,
bedroom REAL,
bath REAL,
price INTEGER,
size INTEGER,
lot INTEGER,
CONSTRAINT pk_houses PRIMARY KEY (id)
);
-- RESET client_min_messages;
COPY houses (tax, bedroom, bath, price, size, lot) FROM stdin;
590 2 1 50000 770 22100
1050 3 2 85000 1410 12000
20 3 1 22500 1060 3500
870 2 2 90000 1300 17500
1320 3 2 133000 1500 30000
1350 2 1 90500 820 25700
2790 3 2.5 260000 2130 25000
680 2 1 142500 1170 22000
1840 3 2 160000 1500 19000
3680 4 2 240000 2790 20000
1660 3 1 87000 1030 17500
1620 3 2 118600 1250 20000
3100 3 2 140000 1760 38000
2070 2 3 148000 1550 14000
650 3 1.5 65000 1450 12000
\.
--------------------------------------------------------------------------------
-- Test
--------------------------------------------------------------------------------
-- We convert to real in order to round to fewer significant digits. Otherwise,
-- different platform might produce slightly different results for the least
-- significant digits, depending on the implementation of internal library
-- functions.
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).coef::REAL[] from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).r2::REAL from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).t_stats::REAL[] from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).p_values::REAL[] from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).coef::REAL[] from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).r2::REAL from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).t_stats::REAL[] from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).p_values::REAL[] from unm;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).coef::REAL[] from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).r2::REAL from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).t_stats::REAL[] from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).p_values::REAL[] from houses;