| --------------------------------------------------------------------------- |
| -- Rules: |
| -- ------ |
| -- 1) Any DB objects should be created w/o schema prefix, |
| -- since this file is executed in a separate schema context. |
| -- 2) There should be no DROP statements in this script, since |
| -- all objects created in the default schema will be cleaned-up outside. |
| --------------------------------------------------------------------------- |
| |
| --------------------------------------------------------------------------- |
| -- Setup: |
| --------------------------------------------------------------------------- |
| -- Creating artificial random data ============================= |
| -- For the model |
| -- E[y] = logit^(-1) ( x^T * [10, -2.4, 3.1, 0.6, -1.6, 1.3] ) |
| -- we generate 5000 rows (y, x[]), where each x[1] = 1 and each x[i], |
| -- for i = 2,...,6, is a random variate corresponding to normal distribution |
| -- with mean 0 and standard deviation 1. |
| |
| CREATE TABLE artificiallogreg |
| ( |
| id SERIAL NOT NULL, |
| y BOOLEAN, |
| x REAL[] |
| ); |
| |
| -- Create an array of normally distributed random variates |
| -- We use the Marsaglia polar method |
| CREATE OR REPLACE FUNCTION randomNormalArray(n INTEGER) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| DECLARE |
| u DOUBLE PRECISION; |
| v DOUBLE PRECISION; |
| s DOUBLE PRECISION; |
| x DOUBLE PRECISION; |
| y DOUBLE PRECISION; |
| i INTEGER; |
| theArray DOUBLE PRECISION[]; |
| BEGIN |
| FOR i IN 1..(n+1)/2 LOOP |
| LOOP |
| u = random() * 2. - 1.; |
| v = random() * 2. - 1.; |
| s = u * u + v * v; |
| EXIT WHEN s < 1.; |
| END LOOP; |
| x = u * sqrt(-2. * ln(s) / s); |
| y = v * sqrt(-2. * ln(s) / s); |
| |
| theArray[2 * i - 1] = x; |
| IF 2 * i <= n THEN |
| theArray[2 * i] = y; |
| END IF; |
| END LOOP; |
| RETURN theArray; |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| CREATE OR REPLACE FUNCTION dotProduct( |
| vec1 DOUBLE PRECISION[], |
| vec2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION AS $$ |
| DECLARE |
| sum DOUBLE PRECISION; |
| dim INTEGER; |
| i INTEGER; |
| BEGIN |
| IF array_lower(vec1, 1) != 1 OR array_lower(vec2, 1) != 1 OR |
| array_upper(vec1, 1) != array_upper(vec2, 1) THEN |
| RETURN NULL; |
| END IF; |
| dim = array_upper(vec1, 1); |
| sum = 0.; |
| FOR i in 1..dim LOOP |
| sum := sum + vec1[i] * vec2[i]; |
| END LOOP; |
| RETURN sum; |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| CREATE TABLE randomdata AS |
| SELECT id, array[1]::REAL[] || randomNormalArray(5)::REAL[] AS x |
| FROM generate_series(1,5000) AS id; |
| |
| INSERT INTO artificiallogreg (y, x) |
| SELECT |
| random() < 1. / (1. + exp(-dotProduct(r.x, c.coef))), |
| r.x |
| FROM randomdata AS r, (SELECT array[10, -2.4, 3.1, 0.6, -1.6, 1.3] AS coef) AS c; |
| |
| -- Calculate the Log Reg |
| SELECT * FROM MADLIB_SCHEMA.logregr( |
| 'artificiallogreg', 'y', 'x', 20, 'irls', 0.001 |
| ); |
| |
| |
| -------------------------------------------------------------------------------- |
| -- Test 1: Predicting heart attack |
| -------------------------------------------------------------------------------- |
| -- The following example is taken from: |
| -- http://luna.cas.usf.edu/~mbrannic/files/regression/Logistic.html |
| |
| CREATE TABLE patients ( |
| id INTEGER NOT NULL, |
| second_attack INTEGER, |
| treatment INTEGER, |
| trait_anxiety INTEGER, |
| CONSTRAINT pk_patient PRIMARY key (id) |
| ); |
| |
| COPY patients (ID, second_attack, treatment, trait_anxiety) FROM stdin; |
| 1 1 1 70 |
| 2 1 1 80 |
| 3 1 1 50 |
| 4 1 0 60 |
| 5 1 0 40 |
| 6 1 0 65 |
| 7 1 0 75 |
| 8 1 0 80 |
| 9 1 0 70 |
| 10 1 0 60 |
| 11 0 1 65 |
| 12 0 1 50 |
| 13 0 1 45 |
| 14 0 1 35 |
| 15 0 1 40 |
| 16 0 1 50 |
| 17 0 0 55 |
| 18 0 0 45 |
| 19 0 0 50 |
| 20 0 0 60 |
| \. |
| |
| CREATE VIEW patients_view AS |
| SELECT array[ 1, treatment, trait_anxiety]::float[] AS x, second_attack::boolean AS y |
| FROM patients; |
| |
| -- Calculate the Log Reg |
| SELECT * FROM MADLIB_SCHEMA.logregr( |
| 'patients_view', 'y', 'x', 20, 'irls', 0.001 |
| ); |
| |
| |
| -------------------------------------------------------------------------------- |
| -- Test 2: Crime data |
| -------------------------------------------------------------------------------- |
| -- The following example is taken from: |
| -- http://www.ats.ucla.edu/stat/stata/output/old/lognoframe.htm |
| /* |
| |
| Description |
| These data are crime-related and demographic statistics for 47 US states in |
| 1960. The data were collected from the FBI's Uniform Crime Report and other |
| government agencies to determine how the variable crime rate depends on the |
| other variables measured in the study. |
| Number of cases: 47 |
| |
| CrimeRat: Crime rate: # of offenses reported to police per million population |
| MaleTeen: The number of males of age 14-24 per 1000 population |
| South : Indicator variable for Southern states (0 = No, 1 = Yes) |
| Educ : Mean # of years of schooling for persons of age 25 or older |
| Police60: 1960 per capita expenditure on police by state and local government |
| Police59: 1959 per capita expenditure on police by state and local government |
| Labor : Labor force participation rate per 1000 civilian urban males age 14-24 |
| Males : The number of males per 1000 females |
| Pop : State population size in hundred thousands |
| NonWhite: The number of non-whites per 1000 population |
| Unemp1 : Unemployment rate of urban males per 1000 of age 14-24 |
| Unemp2 : Unemployment rate of urban males per 1000 of age 35-39 |
| Median : Median value of transferable goods and assets or family income in tens of $ |
| BelowMed: The number of families per 1000 earning below 1/2 the median income |
| |
| */ |
| |
| CREATE TABLE crime ( |
| id SERIAL NOT NULL, |
| CrimeRat DOUBLE PRECISION, |
| MaleTeen INTEGER, |
| South SMALLINT, |
| Educ DOUBLE PRECISION, |
| Police60 INTEGER, |
| Police59 INTEGER, |
| Labor INTEGER, |
| Males INTEGER, |
| Pop INTEGER, |
| NonWhite INTEGER, |
| Unemp1 INTEGER, |
| Unemp2 INTEGER, |
| Median INTEGER, |
| BelowMed INTEGER, |
| CONSTRAINT pk_crime PRIMARY key (id) |
| ); |
| |
| COPY crime ( |
| CrimeRat, MaleTeen, South, Educ, Police60, Police59, Labor, Males, Pop, |
| NonWhite, Unemp1, Unemp2, Median, BelowMed |
| ) FROM stdin; |
| 79.1 151 1 9.1 58 56 510 950 33 301 108 41 394 261 |
| 163.5 143 0 11.3 103 95 583 1012 13 102 96 36 557 194 |
| 57.8 142 1 8.9 45 44 533 969 18 219 94 33 318 250 |
| 196.9 136 0 12.1 149 141 577 994 157 80 102 39 673 167 |
| 123.4 141 0 12.1 109 101 591 985 18 30 91 20 578 174 |
| 68.2 121 0 11.0 118 115 547 964 25 44 84 29 689 126 |
| 96.3 127 1 11.1 82 79 519 982 4 139 97 38 620 168 |
| 155.5 131 1 10.9 115 109 542 969 50 179 79 35 472 206 |
| 85.6 157 1 9.0 65 62 553 955 39 286 81 28 421 239 |
| 70.5 140 0 11.8 71 68 632 1029 7 15 100 24 526 174 |
| 167.4 124 0 10.5 121 116 580 966 101 106 77 35 657 170 |
| 84.9 134 0 10.8 75 71 595 972 47 59 83 31 580 172 |
| 51.1 128 0 11.3 67 60 624 972 28 10 77 25 507 206 |
| 66.4 135 0 11.7 62 61 595 986 22 46 77 27 529 190 |
| 79.8 152 1 8.7 57 53 530 986 30 72 92 43 405 264 |
| 94.6 142 1 8.8 81 77 497 956 33 321 116 47 427 247 |
| 53.9 143 0 11.0 66 63 537 977 10 6 114 35 487 166 |
| 92.9 135 1 10.4 123 115 537 978 31 170 89 34 631 165 |
| 75.0 130 0 11.6 128 128 536 934 51 24 78 34 627 135 |
| 122.5 125 0 10.8 113 105 567 985 78 94 130 58 626 166 |
| 74.2 126 0 10.8 74 67 602 984 34 12 102 33 557 195 |
| 43.9 157 1 8.9 47 44 512 962 22 423 97 34 288 276 |
| 121.6 132 0 9.6 87 83 564 953 43 92 83 32 513 227 |
| 96.8 131 0 11.6 78 73 574 1038 7 36 142 42 540 176 |
| 52.3 130 0 11.6 63 57 641 984 14 26 70 21 486 196 |
| 199.3 131 0 12.1 160 143 631 1071 3 77 102 41 674 152 |
| 34.2 135 0 10.9 69 71 540 965 6 4 80 22 564 139 |
| 121.6 152 0 11.2 82 76 571 1018 10 79 103 28 537 215 |
| 104.3 119 0 10.7 166 157 521 938 168 89 92 36 637 154 |
| 69.6 166 1 8.9 58 54 521 973 46 254 72 26 396 237 |
| 37.3 140 0 9.3 55 54 535 1045 6 20 135 40 453 200 |
| 75.4 125 0 10.9 90 81 586 964 97 82 105 43 617 163 |
| 107.2 147 1 10.4 63 64 560 972 23 95 76 24 462 233 |
| 92.3 126 0 11.8 97 97 542 990 18 21 102 35 589 166 |
| 65.3 123 0 10.2 97 87 526 948 113 76 124 50 572 158 |
| 127.2 150 0 10.0 109 98 531 964 9 24 87 38 559 153 |
| 83.1 177 1 8.7 58 56 638 974 24 349 76 28 382 254 |
| 56.6 133 0 10.4 51 47 599 1024 7 40 99 27 425 225 |
| 82.6 149 1 8.8 61 54 515 953 36 165 86 35 395 251 |
| 115.1 145 1 10.4 82 74 560 981 96 126 88 31 488 228 |
| 88.0 148 0 12.2 72 66 601 998 9 19 84 20 590 144 |
| 54.2 141 0 10.9 56 54 523 968 4 2 107 37 489 170 |
| 82.3 162 1 9.9 75 70 522 996 40 208 73 27 496 224 |
| 103.0 136 0 12.1 95 96 574 1012 29 36 111 37 622 162 |
| 45.5 139 1 8.8 46 41 480 968 19 49 135 53 457 249 |
| 50.8 126 0 10.4 106 97 599 989 40 24 78 25 593 171 |
| 84.9 130 0 12.1 90 91 623 1049 3 22 113 40 588 160 |
| \. |
| |
| CREATE VIEW crime_prepared AS |
| SELECT |
| id, |
| crimerat >= 110 AS hicrime, |
| array[1, maleteen, south, educ, police59] AS x |
| FROM |
| crime; |
| |
| --------------------------------------------------------------------------- |
| -- Test |
| --------------------------------------------------------------------------- |
| -- test function |
| CREATE OR REPLACE FUNCTION install_test() RETURNS VOID AS $$ |
| declare |
| |
| result_ll float; |
| lgres logregr_result; |
| |
| begin |
| -- DROP TABLE IF EXISTS data_pre; |
| CREATE TABLE log_data_pre(x1 float, x2 float, noise float); |
| INSERT INTO log_data_pre SELECT random()*10-5,random(),random() FROM generate_series(1,3000); |
| |
| -- DROP TABLE IF EXISTS data CASCADE; |
| CREATE TABLE log_data(r1 float[],val boolean); |
| INSERT INTO log_data SELECT array[1,x1,x2], (1/(1 + exp(-(2*x1+5)))) > noise FROM log_data_pre; |
| |
| SELECT INTO lgres (t).* from (SELECT * from MADLIB_SCHEMA.logregr('log_data','val','r1',100,'irls',0.001)) as t; |
| |
| IF (abs(lgres.coef[1]-5) > 0.7) OR (abs(lgres.coef[2]-2) > 0.7) OR abs(lgres.coef[3]) > 0.7 THEN |
| RAISE EXCEPTION 'Incorrect coefficients, got %',lgres.coef; |
| END IF; |
| |
| IF (lgres.p_values[1] > 0.01) OR (lgres.p_values[2] > 0.01) OR (lgres.p_values[3]) < 0.01 THEN |
| RAISE EXCEPTION 'Incorrect p_values, got %',lgres.p_values; |
| END IF; |
| |
| -- DROP VIEW IF EXISTS fitdata CASCADE; |
| EXECUTE 'CREATE VIEW fitdata AS |
| SELECT val, |
| (val::INT)*ln(1/(1+exp(-array_dot(r1, array[' ||array_to_string(lgres.coef,',')|| ']::float[])))) |
| + (1-val::INT)*ln(1-(1/(1+exp(-array_dot(r1, array[' ||array_to_string(lgres.coef,',')|| ']::float[]))))) |
| as LL FROM log_data'; |
| |
| SELECT INTO result_ll sum(ll) from fitdata; |
| |
| IF (round(result_ll*100 - lgres.log_likelihood*100) != 0) THEN |
| RAISE EXCEPTION 'Incorrect loglikelihood, got %, expected %',lgres.log_likelihood,result_ll; |
| END IF; |
| |
| RAISE INFO 'Logistic regression install checks passed'; |
| RETURN; |
| |
| end |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- Cleanup |
| --------------------------------------------------------------------------- |
| SELECT install_test(); |