---------------------------------------------------------------------------
-- Rules: 
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
--    since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
--    all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------

---------------------------------------------------------------------------
-- Setup: 
---------------------------------------------------------------------------
CREATE FUNCTION install_test() RETURNS VOID AS $$ 
declare
    lr MADLIB_SCHEMA.linregr_result;
    
	n FLOAT;
	xsq FLOAT;

	acoef FLOAT[];
	arsq FLOAT;
	atst1 FLOAT;
	atst2 FLOAT;
	atst FLOAT[];

	result INT;
	result2 INT;
	
begin
	-- DROP TABLE IF EXISTS data;	
	CREATE TABLE data(x1 numeric,x2 float,noise numeric);
	INSERT INTO data(x1,x2,noise) SELECT (random()-0.5),(random()-0.5),(random()-0.5) FROM generate_series(1,100) as a;

	-- DROP TABLE IF EXISTS data2;
	CREATE TABLE data2(r1 float, r2 float, r3 float, y numeric);
	INSERT INTO data2(r1,r2,r3,y) SELECT x1,x2,noise,(x1*3.4+x2*2.5+noise) FROM data;

	lr := (SELECT MADLIB_SCHEMA.linregr(y,array[r1,1]) FROM data2);

	--check univariate results
	SELECT count(y) INTO n FROM data2;
	SELECT sum(r1*r1)::float INTO xsq FROM data2;
	SELECT ARRAY[ ((count(y) * sum(r1*y) - sum(r1)*sum(y))/(count(y) * sum(r1*r1) - sum(r1)^2)),
		((sum(y)*sum(r1*r1) - sum(r1)*sum(r1*y))/(count(y) * sum(r1*r1) - sum(r1)^2))]::float[] INTO acoef FROM data2;
	SELECT ((count(y) * sum(r1*y) - sum(r1)*sum(y))^2/
		((count(y) * sum(r1*r1) - sum(r1)^2) * (count(y) * sum(y*y) - sum(y)^2)))::float INTO arsq FROM data2;
	SELECT |/ ( (count(y)-2) / ( 1/
		( ((count(y) * sum(r1*y) - sum(r1)*sum(y))^2/
		((count(y) * sum(r1*r1) - sum(r1)^2) * (count(y) * sum(y*y) - sum(y)^2))) ) - 1) ) ::float INTO atst1 FROM data2;      	         
	atst = array[ atst1, acoef[2]/( (acoef[1]/atst1)*sqrt(xsq/n)) ];

	result =  (MADLIB_SCHEMA.array_sum( MADLIB_SCHEMA.array_sub( MADLIB_SCHEMA.array_scalar_mult(lr.coef,1000::FLOAT) , MADLIB_SCHEMA.array_scalar_mult(acoef,1000::FLOAT) ) ))::INT 
		+((lr.r2*1000) - (arsq*1000))::INT
		+(MADLIB_SCHEMA.array_sum( MADLIB_SCHEMA.array_sub( MADLIB_SCHEMA.array_scalar_mult(lr.t_stats,1000::FLOAT) , MADLIB_SCHEMA.array_scalar_mult(atst,1000::FLOAT) ) ))::INT; 

	IF (result != 0) THEN
		RAISE EXCEPTION 'Incorrect univariate results';
	END IF;

	--check multivariate results
	lr := (SELECT MADLIB_SCHEMA.linregr(y,array[r1,r2,1]) FROM data2);

	result = abs(lr.coef[1]*10 - 34)::INT;
	result2 = abs(lr.coef[2]*10 - 25)::INT;

	IF (result > 5) OR (result2 > 5) OR (lr.r2 < 0.9) THEN
		RAISE EXCEPTION 'Incorrect multivariate results, got %,%',lr.coef,lr.r2;
	END IF;
	
	RAISE INFO 'Linear regression install checks passed';
	RETURN;
	
end 
$$ language plpgsql;

SELECT install_test();


-- The following example is taken from:
-- http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm#Example%205.1
-- http://www.weibull.com/DOEWeb/hypothesis_tests_in_multiple_linear_regression.htm#Example%205.3

-- No notice message while creating table (creating implicit index).
-- SET client_min_messages = error;
CREATE TABLE weibull
(
	id INTEGER NOT NULL,
	x1 DOUBLE PRECISION,
	x2 DOUBLE PRECISION,
	y DOUBLE PRECISION,
	CONSTRAINT pk_weibull PRIMARY KEY (id)
);
-- RESET client_min_messages;

COPY weibull (id, x1, x2, y) FROM stdin;
1	41.9	29.1	251.3
2	43.4	29.3	251.3
3	43.9	29.5	248.3
4	44.5	29.7	267.5
5	47.3	29.9	273.0
6	47.5	30.3	276.5
7	47.9	30.5	270.3
8	50.2	30.7	274.9
9	52.8	30.8	285.0
10	53.2	30.9	290.0
11	56.7	31.5	297.0
12	57.0	31.7	302.5
13	63.5	31.9	304.5
14	65.3	32.0	309.3
15	71.1	32.1	321.7
16	77.0	32.5	330.7
17	77.8	32.9	349.0
\.


-- The following example is taken from:
-- http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/
-- data_classification_qsar/linear_multilinear_regression.pdf

-- SET client_min_messages = error;
CREATE TABLE unm
(
	id INTEGER NOT NULL,
	x1 DOUBLE PRECISION,
	x2 DOUBLE PRECISION,
	y DOUBLE PRECISION,
	CONSTRAINT pk_unm PRIMARY KEY (id)
);
-- RESET client_min_messages;

COPY unm (id, x1, x2, y) FROM stdin;
1	0	0.30	10.14
2	0.69	0.60	11.93
3	1.10	0.90	13.57
4	1.39	1.20	14.17
5	1.61	1.50	15.25
6	1.79	1.80	16.15
\.



-- The following example is taken from:
-- http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf

-- SET client_min_messages = error;
CREATE TABLE houses
(
	id SERIAL NOT NULL,
	tax INTEGER,
	bedroom REAL,
	bath REAL,
	price INTEGER,
	size INTEGER,
	lot INTEGER,
	CONSTRAINT pk_houses PRIMARY KEY (id)
);
-- RESET client_min_messages;

COPY houses (tax, bedroom, bath, price, size, lot) FROM stdin;
590	2	1	50000	770	22100
1050	3	2	85000	1410	12000
20	3	1	22500	1060	3500
870	2	2	90000	1300	17500
1320	3	2	133000	1500	30000
1350	2	1	90500	820	25700
2790	3	2.5	260000	2130	25000
680	2	1	142500	1170	22000
1840	3	2	160000	1500	19000
3680	4	2	240000	2790	20000
1660	3	1	87000	1030	17500
1620	3	2	118600	1250	20000
3100	3	2	140000	1760	38000
2070	2	3	148000	1550	14000
650	3	1.5	65000	1450	12000
\.

--------------------------------------------------------------------------------
-- Test
--------------------------------------------------------------------------------

-- We convert to real in order to round to fewer significant digits. Otherwise,
-- different platform might produce slightly different results for the least
-- significant digits, depending on the implementation of internal library
-- functions.

select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).coef::REAL[] from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).r2::REAL from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).t_stats::REAL[] from weibull;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).p_values::REAL[] from weibull;

select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).coef::REAL[] from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).r2::REAL from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).t_stats::REAL[] from unm;
select (MADLIB_SCHEMA.linregr(y, array[1, x1, x2])).p_values::REAL[] from unm;

select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).coef::REAL[] from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).r2::REAL from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).t_stats::REAL[] from houses;
select (MADLIB_SCHEMA.linregr(price, array[1, bedroom, bath, size])).p_values::REAL[] from houses;

