blob: 21bce42183589f6dd776b283503b3e03f31a3879 [file] [log] [blame]
/* -----------------------------------------------------------------------------
* Test Multinomial Logistic Regression.
* -------------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/*
* The following example is taken from:
* http://luna.cas.usf.edu/~mbrannic/files/regression/Logistic.html
* Predicting heart attack. This example is the same as the (binomial) logistic
* regression example.
*/
DROP TABLE IF EXISTS patients;
CREATE TABLE patients (
id INTEGER NOT NULL,
"SECOND_ATTACK" INTEGER,
treatment INTEGER,
trait_anxiety INTEGER
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
INSERT INTO patients(id, "SECOND_ATTACK", treatment, trait_anxiety) VALUES
( 1, 1, 1, 70),
( 2, 1, 1, 80),
( 3, 1, 1, 50),
( 4, 1, 0, 60),
( 5, 1, 0, 40),
( 6, 1, 0, 65),
( 7, 1, 0, 75),
( 8, 1, 0, 80),
( 9, 1, 0, 70),
(10, 1, 0, 60),
(11, 0, 1, 65),
(12, 0, 1, 50),
(13, 0, 1, 45),
(14, 0, 1, 35),
(15, 0, 1, 40),
(16, 0, 1, 50),
(17, 0, 0, 55),
(18, 0, 0, 45),
(19, 0, 0, 50),
(20, 0, 0, 60);
-- This is the same test case from the logistic regression example, just called with
-- multinomial logistic regression.
-- computed with the IRLS optimizer in MADlib
SELECT assert(
relative_error(coef, ARRAY[6.36, 1.02, -0.119]) < 1e-2 AND
relative_error(log_likelihood, -9.41) < 1e-2 AND
relative_error(std_err, ARRAY[3.21, 1.17, 0.0550]) < 0.002 AND
relative_error(z_stats, ARRAY[1.98, 0.874, -2.17]) < 0.002 AND
relative_error(p_values, ARRAY[0.0477, 0.382, 0.0304]) < 1e-3 AND
relative_error(odds_ratios, ARRAY[580.26, 2.78, 0.89]) < 0.01 AND
relative_error(condition_no, 106329) < 1e-2,
'Multinomial Logistic regression with IRLS optimizer (patients test): Wrong results'
) FROM mlogregr(
'patients', '"SECOND_ATTACK"', 'ARRAY[1, treatment, trait_anxiety]', 20,
'irls', 0.0001, 1);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT mlogregr_train(
'patients',
'temp_result',
'"SECOND_ATTACK"',
'ARRAY[1, treatment, trait_anxiety]');
DROP TABLE IF EXISTS predict_result;
SELECT mlogregr_predict(
'temp_result',
'patients',
'id',
'predict_result'
);
SELECT * FROM predict_result;
/*
* The values given by the multinomial logistic regression were cross checked
* with the Matlab command mnrfit, which is documented at
* http://www.mathworks.com/help/toolbox/stats/mnrfit.html
*
* One important detail in the mnrfit command is that due to a difference in convention,
* its answers for the coefficients are the negative of our coefficient. Our
* convention is chosen to match the convention of the binary
* logistic regression implementation in madlib.
*
* For completeness, the matlab code needed to check the answers to the 'test3' example
* is included below. The code assumes that the data is contained in a csv file
* and that the columns haven't changed order. The coefficients will be in the
* 'B' variable.
*
* BEGIN CODE
*
data = csvread(csvFilename);
N = size(data, 1); % Number of records
J = size(data, 2)-1; % Number of covariates
% Integer encoded categories {0,1...K-1}
int_y = 1+data(:,end); % Categories
x = data(:,1:end-1); % Independant variables
% Pivot around the last data point
[B,dev,stats] = mnrfit(x,int_y)
*
* END CODE
*/
DROP TABLE IF EXISTS test3;
CREATE TABLE test3 (
id SERIAL,
feat1 INTEGER,
feat2 INTEGER,
cat INTEGER
);
INSERT INTO test3(feat1, feat2, cat) VALUES
(1,35,1),
(2,33,0),
(3,39,1),
(1,37,1),
(2,31,1),
(3,36,0),
(2,36,1),
(2,31,1),
(2,41,1),
(2,37,1),
(1,44,1),
(3,33,2),
(1,31,1),
(2,44,1),
(1,35,1),
(1,44,0),
(1,46,0),
(2,46,1),
(2,46,2),
(3,49,1),
(2,39,0),
(2,44,1),
(1,47,1),
(1,44,1),
(1,37,2),
(3,38,2),
(1,49,0),
(2,44,0),
(1,41,2),
(1,50,2),
(2,44,0),
(1,39,1),
(1,40,2),
(1,46,2),
(2,41,1),
(2,39,1),
(2,33,1),
(3,59,2),
(1,41,0),
(2,47,2),
(2,31,0),
(3,42,2),
(1,55,2),
(3,40,1),
(1,44,2),
(1,54,1),
(2,46,1),
(1,54,0),
(2,42,1),
(2,49,2),
(2,41,2),
(2,41,1),
(1,44,0),
(1,57,2),
(2,52,2),
(1,49,0),
(3,41,2),
(3,57,0),
(1,62,1),
(3,33,0),
(2,54,1),
(2,40,2),
(3,52,2),
(2,57,1),
(2,49,1),
(2,46,1),
(1,57,0),
(2,49,2),
(2,52,2),
(2,53,0),
(3,54,2),
(2,57,2),
(3,41,2),
(1,52,0),
(2,57,1),
(1,54,0),
(2,52,1),
(2,52,0),
(2,44,0),
(2,46,2),
(1,49,1),
(2,54,2),
(3,52,2),
(1,44,0),
(3,49,1),
(1,46,2),
(2,54,0),
(2,39,0),
(2,59,0),
(2,45,1),
(3,52,1),
(3,54,0),
(3,44,1),
(2,50,2),
(2,62,1),
(2,59,0),
(2,52,2),
(2,52,1),
(2,46,1),
(2,41,0),
(2,52,2),
(2,52,1),
(2,55,1),
(2,41,1),
(2,49,0),
(1,59,2),
(1,54,0),
(2,54,0),
(2,59,2),
(2,55,2),
(1,62,2),
(2,54,2),
(2,54,2),
(2,54,2),
(2,59,2),
(2,57,1),
(3,61,2),
(3,52,2),
(2,59,2),
(2,62,2),
(1,60,1),
(2,59,2),
(2,65,2),
(3,61,2),
(2,59,2),
(3,59,2),
(2,59,2),
(2,59,2),
(2,65,2),
(3,57,2),
(2,59,2),
(3,49,2),
(1,49,0),
(3,59,2),
(2,62,2),
(3,59,0),
(2,54,2),
(3,63,2),
(1,43,2),
(3,54,2),
(3,52,2),
(1,57,2),
(2,57,0),
(2,57,0),
(2,61,2),
(2,62,0),
(2,62,0),
(1,65,0),
(2,57,2),
(3,59,2),
(2,59,2),
(3,62,2),
(2,65,2),
(2,62,1),
(1,62,0),
(2,62,2),
(3,54,2),
(3,62,2),
(1,65,2),
(3,62,2),
(3,67,0),
(3,65,0),
(1,60,2),
(3,59,2),
(2,59,2),
(2,59,1),
(3,65,0),
(3,62,2),
(3,65,2),
(3,59,0),
(1,59,0),
(3,61,2),
(1,65,2),
(3,67,1),
(3,65,2),
(1,65,2),
(2,67,2),
(1,65,2),
(1,62,2),
(3,52,2),
(3,63,2),
(2,59,2),
(3,65,2),
(2,59,0),
(3,67,2),
(3,67,2),
(3,60,2),
(3,67,2),
(3,62,2),
(2,54,2),
(3,65,2),
(3,62,2),
(2,59,2),
(3,60,2),
(3,63,2),
(3,65,2),
(2,63,1),
(2,67,2),
(2,65,2),
(2,62,2);
SELECT assert(
relative_error(coef, ARRAY[3.579, 5.99, -0.636, -0.451, -0.0581, -0.112]) < 1e-2 AND
relative_error(log_likelihood, -182.22) < 1e-2 AND
relative_error(std_err, ARRAY[1.219, 1.209, 0.266, 0.273, 0.0214, 0.0216]) < 0.02 AND
relative_error(z_stats, ARRAY[2.935, 4.953, -2.3912, -1.653, -2.710, -5.171]) < 0.02 AND
relative_error(p_values, ARRAY[0.003326, 7.3019e-07, 0.0167, 0.0982, 0.006726, 2.3177e-07]) < 1e-2 AND
relative_error(odds_ratios, ARRAY[35.85, 399.42, 0.53, 0.64, 0.94, 0.89]) < 1e-2 AND
relative_error(condition_no, 256089) < 1e-2,
'Multinomial Logistic regression with IRLS optimizer (test): Wrong results'
) FROM mlogregr(
'test3', 'cat', 'ARRAY[1, feat1, feat2]', 20, 'irls', 0.001, 2
);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT mlogregr_train(
'test3',
'temp_result',
'cat',
'ARRAY[1, feat1, feat2]');
DROP TABLE IF EXISTS predict_result;
SELECT mlogregr_predict(
'temp_result',
'test3',
'id',
'predict_result',
'response');
select * from predict_result;
DROP TABLE IF EXISTS predict_result;
SELECT mlogregr_predict(
'temp_result',
'test3',
'id',
'predict_result',
'prob');
select * from predict_result;
DROP TABLE IF EXISTS predict_result;
SELECT mlogregr_predict(
'temp_result',
'test3',
'ARRAY[1, feat1, feat2]',
'predict_result',
'response');
select * from predict_result;
DROP TABLE IF EXISTS predict_result;
SELECT mlogregr_predict(
'temp_result',
'test3',
'ARRAY[1, feat1, feat2]',
'predict_result',
'prob');
select * from predict_result;
-- NULL handling test
DROP TABLE IF EXISTS patients_with_null;
CREATE TABLE patients_with_null (
id INTEGER NOT NULL,
second_attack INTEGER,
treatment INTEGER,
trait_anxiety INTEGER
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
INSERT INTO patients_with_null(ID, second_attack, treatment, trait_anxiety) VALUES
( 1, 1, 1, 70),
( 2, 1, 1, 80),
( 3, 1, 1, 50),
( 4, 1, 0, 60),
( 5, 1, 0, 40),
( 6, 1, 0, 65),
( 7, 1, 0, 75),
( 8, 1, 0, 80),
( 9, 1, 0, 70),
(10, 1, 0, 60),
(11, 0, 1, 65),
(12, 0, 1, 50),
(13, 0, 1, 45),
(14, 0, 1, 35),
(15, 0, 1, 40),
(16, 0, 1, 50),
(17, 0, 0, 55),
(18, 0, 0, 45),
(19, 0, 0, 50),
(20, 0, 0, 60),
(21, NULL, 0, 45),
(22, 0, NULL, 50),
(23, 0, 0, NULL);
-- This is the same test case from the logistic regression example, just called with
-- multinomial logistic regression.
-- computed with the IRLS optimizer in MADlib
SELECT assert(
relative_error(coef, ARRAY[6.36, 1.02, -0.119]) < 1e-2 AND
relative_error(log_likelihood, -9.41) < 1e-2 AND
relative_error(std_err, ARRAY[3.21, 1.17, 0.0550]) < 0.002 AND
relative_error(z_stats, ARRAY[1.98, 0.874, -2.17]) < 0.002 AND
relative_error(p_values, ARRAY[0.0477, 0.382, 0.0304]) < 1e-3 AND
relative_error(odds_ratios, ARRAY[580.26, 2.78, 0.89]) < 0.01 AND
relative_error(condition_no, 106329) < 1e-2 AND
num_processed = 20,
'Multinomial Logistic regression with IRLS optimizer (patients test): Wrong results'
) FROM mlogregr(
'patients_with_null', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', 20,
'irls', 0.0001, 1);
DROP TABLE IF EXISTS temp_result;
DROP TABLE IF EXISTS temp_result_summary;
SELECT mlogregr_train(
'patients_with_null',
'temp_result',
'second_attack',
'ARRAY[1, treatment, trait_anxiety]');
-- All NULL input
DROP TABLE IF EXISTS patients_all_null;
CREATE TABLE patients_all_null (
id INTEGER NOT NULL,
second_attack INTEGER,
treatment INTEGER,
trait_anxiety INTEGER
)m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
INSERT INTO patients_all_null(ID, second_attack, treatment, trait_anxiety) VALUES
( 1, NULL, 1, 70),
( 2, 1, 1, NULL),
( 3, 1, 1, NULL),
( 4, 1, 0, NULL),
( 5, 1, 0, NULL),
( 6, 1, 0, NULL),
( 7, 1, 0, NULL),
( 8, 1, 0, NULL),
( 9, 1, 0, NULL),
(10, 1, 0, NULL),
(11, NULL, 1, 65),
(12, 0, NULL, 50),
(13, 0, NULL, 45),
(14, 0, NULL, 35),
(15, 0, NULL, 40),
(16, 0, NULL, 50),
(17, 0, NULL, 55),
(18, 0, NULL, 45),
(19, 0, NULL, 50),
(20, 0, NULL, 60);
SELECT mlogregr(
'patients_all_null',
'second_attack',
'ARRAY[1, treatment, trait_anxiety]',
20, 'irls', 0.0001, 1);