| \qecho ================================================================= |
| \qecho === Running: logistic regression ================================ |
| \qecho ================================================================= |
| \qecho |
| \qecho === Creating artificial random data ============================= |
| \qecho === For the model |
| \qecho === E[y] = logit^(-1) ( x^T * [-2.4, 3.1, 0.6, -1.6, 1.3] ) |
| \qecho === we generate 5000 rows (y, x[]), where each x[i] is a random |
| \qecho === variate corresponding to normal distribution with mean x[i] |
| \qecho === and standard deviation 1. |
| |
| |
| SET client_min_messages = error; |
| |
| DROP TABLE IF EXISTS artificiallogreg; |
| DROP TABLE IF EXISTS randomdata; |
| |
| CREATE TABLE artificiallogreg |
| ( |
| id SERIAL NOT NULL, |
| y BOOLEAN, |
| x REAL[] |
| ); |
| |
| RESET client_min_messages; |
| |
| -- Create an array of normally distributed random variates |
| -- We use the Marsaglia polar method |
| CREATE OR REPLACE FUNCTION randomNormalArray(n INTEGER) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| DECLARE |
| u DOUBLE PRECISION; |
| v DOUBLE PRECISION; |
| s DOUBLE PRECISION; |
| x DOUBLE PRECISION; |
| y DOUBLE PRECISION; |
| i INTEGER; |
| theArray DOUBLE PRECISION[]; |
| BEGIN |
| FOR i IN 1..(n+1)/2 LOOP |
| LOOP |
| u = random() * 2. - 1.; |
| v = random() * 2. - 1.; |
| s = u * u + v * v; |
| EXIT WHEN s < 1.; |
| END LOOP; |
| x = u * sqrt(-2. * ln(s) / s); |
| y = v * sqrt(-2. * ln(s) / s); |
| |
| theArray[2 * i - 1] = x; |
| IF 2 * i <= n THEN |
| theArray[2 * i] = y; |
| END IF; |
| END LOOP; |
| RETURN theArray; |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| CREATE OR REPLACE FUNCTION dotProduct( |
| vec1 DOUBLE PRECISION[], |
| vec2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION AS $$ |
| DECLARE |
| sum DOUBLE PRECISION; |
| dim INTEGER; |
| i INTEGER; |
| BEGIN |
| IF array_lower(vec1, 1) != 1 OR array_lower(vec2, 1) != 1 OR |
| array_upper(vec1, 1) != array_upper(vec2, 1) THEN |
| RETURN NULL; |
| END IF; |
| dim = array_upper(vec1, 1); |
| sum = 0.; |
| FOR i in 1..dim LOOP |
| sum := sum + vec1[i] * vec2[i]; |
| END LOOP; |
| RETURN sum; |
| END; |
| $$ LANGUAGE plpgsql; |
| |
| SET client_min_messages = error; |
| CREATE TABLE randomdata AS |
| SELECT id, randomNormalArray(5)::REAL[] AS x |
| FROM generate_series(1,5000) AS id; |
| RESET client_min_messages; |
| |
| INSERT INTO artificiallogreg (y, x) |
| SELECT |
| random() < 1. / (1. + exp(-dotProduct(r.x, c.coef))), |
| r.x |
| FROM randomdata AS r, (SELECT array[-2.4, 3.1, 0.6, -1.6, 1.3] AS coef) AS c; |
| |
| \qecho === Show data sample (top 5 rows only) ========================== |
| |
| SELECT * FROM artificiallogreg LIMIT 5; |
| |
| \qecho === Calculate Coefficients from artificial data: ================ |
| |
| SELECT madlib.logregr_coef( |
| 'artificiallogreg', 'y', 'x', 20, 'irls', 0.001 |
| )::REAL[]; |