blob: a0da2fe7706981b730b1723d3b8e2991aa14d1e7 [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Setup:
---------------------------------------------------------------------------
-- Creating artificial random data =============================
-- For the model
-- E[y] = logit^(-1) ( x^T * [10, -2.4, 3.1, 0.6, -1.6, 1.3] )
-- we generate 5000 rows (y, x[]), where each x[1] = 1 and each x[i],
-- for i = 2,...,6, is a random variate corresponding to normal distribution
-- with mean 0 and standard deviation 1.
CREATE TABLE artificiallogreg
(
id SERIAL NOT NULL,
y BOOLEAN,
x REAL[]
);
-- Create an array of normally distributed random variates
-- We use the Marsaglia polar method
CREATE OR REPLACE FUNCTION randomNormalArray(n INTEGER)
RETURNS DOUBLE PRECISION[] AS $$
DECLARE
u DOUBLE PRECISION;
v DOUBLE PRECISION;
s DOUBLE PRECISION;
x DOUBLE PRECISION;
y DOUBLE PRECISION;
i INTEGER;
theArray DOUBLE PRECISION[];
BEGIN
FOR i IN 1..(n+1)/2 LOOP
LOOP
u = random() * 2. - 1.;
v = random() * 2. - 1.;
s = u * u + v * v;
EXIT WHEN s < 1.;
END LOOP;
x = u * sqrt(-2. * ln(s) / s);
y = v * sqrt(-2. * ln(s) / s);
theArray[2 * i - 1] = x;
IF 2 * i <= n THEN
theArray[2 * i] = y;
END IF;
END LOOP;
RETURN theArray;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION dotProduct(
vec1 DOUBLE PRECISION[],
vec2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS $$
DECLARE
sum DOUBLE PRECISION;
dim INTEGER;
i INTEGER;
BEGIN
IF array_lower(vec1, 1) != 1 OR array_lower(vec2, 1) != 1 OR
array_upper(vec1, 1) != array_upper(vec2, 1) THEN
RETURN NULL;
END IF;
dim = array_upper(vec1, 1);
sum = 0.;
FOR i in 1..dim LOOP
sum := sum + vec1[i] * vec2[i];
END LOOP;
RETURN sum;
END;
$$ LANGUAGE plpgsql;
CREATE TABLE randomdata AS
SELECT id, array[1]::REAL[] || randomNormalArray(5)::REAL[] AS x
FROM generate_series(1,5000) AS id;
INSERT INTO artificiallogreg (y, x)
SELECT
random() < 1. / (1. + exp(-dotProduct(r.x, c.coef))),
r.x
FROM randomdata AS r, (SELECT array[10, -2.4, 3.1, 0.6, -1.6, 1.3] AS coef) AS c;
-- Calculate the Log Reg
SELECT * FROM MADLIB_SCHEMA.logregr(
'artificiallogreg', 'y', 'x', 20, 'irls', 0.001
);
--------------------------------------------------------------------------------
-- Test 1: Predicting heart attack
--------------------------------------------------------------------------------
-- The following example is taken from:
-- http://luna.cas.usf.edu/~mbrannic/files/regression/Logistic.html
CREATE TABLE patients (
id INTEGER NOT NULL,
second_attack INTEGER,
treatment INTEGER,
trait_anxiety INTEGER,
CONSTRAINT pk_patient PRIMARY key (id)
);
COPY patients (ID, second_attack, treatment, trait_anxiety) FROM stdin;
1 1 1 70
2 1 1 80
3 1 1 50
4 1 0 60
5 1 0 40
6 1 0 65
7 1 0 75
8 1 0 80
9 1 0 70
10 1 0 60
11 0 1 65
12 0 1 50
13 0 1 45
14 0 1 35
15 0 1 40
16 0 1 50
17 0 0 55
18 0 0 45
19 0 0 50
20 0 0 60
\.
CREATE VIEW patients_view AS
SELECT array[ 1, treatment, trait_anxiety]::float[] AS x, second_attack::boolean AS y
FROM patients;
-- Calculate the Log Reg
SELECT * FROM MADLIB_SCHEMA.logregr(
'patients_view', 'y', 'x', 20, 'irls', 0.001
);
--------------------------------------------------------------------------------
-- Test 2: Crime data
--------------------------------------------------------------------------------
-- The following example is taken from:
-- http://www.ats.ucla.edu/stat/stata/output/old/lognoframe.htm
/*
Description
These data are crime-related and demographic statistics for 47 US states in
1960. The data were collected from the FBI's Uniform Crime Report and other
government agencies to determine how the variable crime rate depends on the
other variables measured in the study.
Number of cases: 47
CrimeRat: Crime rate: # of offenses reported to police per million population
MaleTeen: The number of males of age 14-24 per 1000 population
South : Indicator variable for Southern states (0 = No, 1 = Yes)
Educ : Mean # of years of schooling for persons of age 25 or older
Police60: 1960 per capita expenditure on police by state and local government
Police59: 1959 per capita expenditure on police by state and local government
Labor : Labor force participation rate per 1000 civilian urban males age 14-24
Males : The number of males per 1000 females
Pop : State population size in hundred thousands
NonWhite: The number of non-whites per 1000 population
Unemp1 : Unemployment rate of urban males per 1000 of age 14-24
Unemp2 : Unemployment rate of urban males per 1000 of age 35-39
Median : Median value of transferable goods and assets or family income in tens of $
BelowMed: The number of families per 1000 earning below 1/2 the median income
*/
CREATE TABLE crime (
id SERIAL NOT NULL,
CrimeRat DOUBLE PRECISION,
MaleTeen INTEGER,
South SMALLINT,
Educ DOUBLE PRECISION,
Police60 INTEGER,
Police59 INTEGER,
Labor INTEGER,
Males INTEGER,
Pop INTEGER,
NonWhite INTEGER,
Unemp1 INTEGER,
Unemp2 INTEGER,
Median INTEGER,
BelowMed INTEGER,
CONSTRAINT pk_crime PRIMARY key (id)
);
COPY crime (
CrimeRat, MaleTeen, South, Educ, Police60, Police59, Labor, Males, Pop,
NonWhite, Unemp1, Unemp2, Median, BelowMed
) FROM stdin;
79.1 151 1 9.1 58 56 510 950 33 301 108 41 394 261
163.5 143 0 11.3 103 95 583 1012 13 102 96 36 557 194
57.8 142 1 8.9 45 44 533 969 18 219 94 33 318 250
196.9 136 0 12.1 149 141 577 994 157 80 102 39 673 167
123.4 141 0 12.1 109 101 591 985 18 30 91 20 578 174
68.2 121 0 11.0 118 115 547 964 25 44 84 29 689 126
96.3 127 1 11.1 82 79 519 982 4 139 97 38 620 168
155.5 131 1 10.9 115 109 542 969 50 179 79 35 472 206
85.6 157 1 9.0 65 62 553 955 39 286 81 28 421 239
70.5 140 0 11.8 71 68 632 1029 7 15 100 24 526 174
167.4 124 0 10.5 121 116 580 966 101 106 77 35 657 170
84.9 134 0 10.8 75 71 595 972 47 59 83 31 580 172
51.1 128 0 11.3 67 60 624 972 28 10 77 25 507 206
66.4 135 0 11.7 62 61 595 986 22 46 77 27 529 190
79.8 152 1 8.7 57 53 530 986 30 72 92 43 405 264
94.6 142 1 8.8 81 77 497 956 33 321 116 47 427 247
53.9 143 0 11.0 66 63 537 977 10 6 114 35 487 166
92.9 135 1 10.4 123 115 537 978 31 170 89 34 631 165
75.0 130 0 11.6 128 128 536 934 51 24 78 34 627 135
122.5 125 0 10.8 113 105 567 985 78 94 130 58 626 166
74.2 126 0 10.8 74 67 602 984 34 12 102 33 557 195
43.9 157 1 8.9 47 44 512 962 22 423 97 34 288 276
121.6 132 0 9.6 87 83 564 953 43 92 83 32 513 227
96.8 131 0 11.6 78 73 574 1038 7 36 142 42 540 176
52.3 130 0 11.6 63 57 641 984 14 26 70 21 486 196
199.3 131 0 12.1 160 143 631 1071 3 77 102 41 674 152
34.2 135 0 10.9 69 71 540 965 6 4 80 22 564 139
121.6 152 0 11.2 82 76 571 1018 10 79 103 28 537 215
104.3 119 0 10.7 166 157 521 938 168 89 92 36 637 154
69.6 166 1 8.9 58 54 521 973 46 254 72 26 396 237
37.3 140 0 9.3 55 54 535 1045 6 20 135 40 453 200
75.4 125 0 10.9 90 81 586 964 97 82 105 43 617 163
107.2 147 1 10.4 63 64 560 972 23 95 76 24 462 233
92.3 126 0 11.8 97 97 542 990 18 21 102 35 589 166
65.3 123 0 10.2 97 87 526 948 113 76 124 50 572 158
127.2 150 0 10.0 109 98 531 964 9 24 87 38 559 153
83.1 177 1 8.7 58 56 638 974 24 349 76 28 382 254
56.6 133 0 10.4 51 47 599 1024 7 40 99 27 425 225
82.6 149 1 8.8 61 54 515 953 36 165 86 35 395 251
115.1 145 1 10.4 82 74 560 981 96 126 88 31 488 228
88.0 148 0 12.2 72 66 601 998 9 19 84 20 590 144
54.2 141 0 10.9 56 54 523 968 4 2 107 37 489 170
82.3 162 1 9.9 75 70 522 996 40 208 73 27 496 224
103.0 136 0 12.1 95 96 574 1012 29 36 111 37 622 162
45.5 139 1 8.8 46 41 480 968 19 49 135 53 457 249
50.8 126 0 10.4 106 97 599 989 40 24 78 25 593 171
84.9 130 0 12.1 90 91 623 1049 3 22 113 40 588 160
\.
CREATE VIEW crime_prepared AS
SELECT
id,
crimerat >= 110 AS hicrime,
array[1, maleteen, south, educ, police59] AS x
FROM
crime;
---------------------------------------------------------------------------
-- Test
---------------------------------------------------------------------------
-- test function
CREATE OR REPLACE FUNCTION install_test() RETURNS VOID AS $$
declare
result_ll float;
lgres logregr_result;
begin
-- DROP TABLE IF EXISTS data_pre;
CREATE TABLE log_data_pre(x1 float, x2 float, noise float);
INSERT INTO log_data_pre SELECT random()*10-5,random(),random() FROM generate_series(1,3000);
-- DROP TABLE IF EXISTS data CASCADE;
CREATE TABLE log_data(r1 float[],val boolean);
INSERT INTO log_data SELECT array[1,x1,x2], (1/(1 + exp(-(2*x1+5)))) > noise FROM log_data_pre;
SELECT INTO lgres (t).* from (SELECT * from MADLIB_SCHEMA.logregr('log_data','val','r1',100,'irls',0.001)) as t;
IF (abs(lgres.coef[1]-5) > 0.7) OR (abs(lgres.coef[2]-2) > 0.7) OR abs(lgres.coef[3]) > 0.7 THEN
RAISE EXCEPTION 'Incorrect coefficients, got %',lgres.coef;
END IF;
IF (lgres.p_values[1] > 0.01) OR (lgres.p_values[2] > 0.01) OR (lgres.p_values[3]) < 0.01 THEN
RAISE EXCEPTION 'Incorrect p_values, got %',lgres.p_values;
END IF;
-- DROP VIEW IF EXISTS fitdata CASCADE;
EXECUTE 'CREATE VIEW fitdata AS
SELECT val,
(val::INT)*ln(1/(1+exp(-array_dot(r1, array[' ||array_to_string(lgres.coef,',')|| ']::float[]))))
+ (1-val::INT)*ln(1-(1/(1+exp(-array_dot(r1, array[' ||array_to_string(lgres.coef,',')|| ']::float[])))))
as LL FROM log_data';
SELECT INTO result_ll sum(ll) from fitdata;
IF (round(result_ll*100 - lgres.log_likelihood*100) != 0) THEN
RAISE EXCEPTION 'Incorrect loglikelihood, got %, expected %',lgres.log_likelihood,result_ll;
END IF;
RAISE INFO 'Logistic regression install checks passed';
RETURN;
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- Cleanup
---------------------------------------------------------------------------
SELECT install_test();