blob: e91c71a7cae2e8ec3572475bd68ff6e29c156767 [file] [log] [blame]
DROP TABLE IF EXISTS dt_golf CASCADE;
CREATE TABLE dt_golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
windy boolean,
class text
) ;
INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
(1, 'sunny', 85, 85, false, 'Don''t Play'),
(2, 'sunny', 80, 90, true, 'Don''t Play'),
(3, 'overcast', 83, 78, false, 'Play'),
(4, 'rain', 70, 96, false, 'Play'),
(5, 'rain', 68, 80, false, 'Play'),
(6, 'rain', 65, 70, true, 'Don''t Play'),
(7, 'overcast', 64, 65, true, 'Play'),
(8, 'sunny', 72, 95, false, 'Don''t Play'),
(9, 'sunny', 69, 70, false, 'Play'),
(10, 'rain', 75, 80, false, 'Play'),
(11, 'sunny', 75, 70, true, 'Play'),
(12, 'overcast', 72, 90, true, 'Play'),
(13, 'overcast', 81, 75, false, 'Play'),
(14, 'rain', 71, 80, true, 'Don''t Play');
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class'::TEXT, -- response
'windy, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
5, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::INTEGER, -- max depth
1::INTEGER, -- min split
1::INTEGER, -- min bucket
8::INTEGER, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group, train_output_poisson_count;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'temperature::double precision'::TEXT, -- response
'humidity, windy'::TEXT, -- features
NULL::TEXT, -- exclude columns
'class', -- grouping
5, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
20::INTEGER, -- num_permutations
10::INTEGER, -- max depth
1::INTEGER, -- min split
1::INTEGER, -- min bucket
3::INTEGER, -- number of bins per continuous variable
'max_surrogates = 2 ',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
SELECT assert(count(distinct(gid)) = 2, 'gid is not correctly generated!') FROM train_output;
-------------------------------------------------------------------------
-- using y to predict y for the sake of testing variable importance
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class::TEXT'::TEXT, -- response
'class, windy, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
100, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::integer, -- max depth
1::integer, -- min split
1::integer, -- min bucket
8::integer, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
SELECT
assert(cat_var_importance[1] > con_var_importance[1], 'class should be important!'),
assert(cat_var_importance[1] > con_var_importance[2], 'class should be important!')
FROM train_output_group;
-------------------------------------------------------------------------
-- using y to predict y for the sake of testing variable importance
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'temperature::double precision'::TEXT, -- response
'class, temperature, windy'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
100, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::integer, -- max depth
1::integer, -- min split
1::integer, -- min bucket
8::integer, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
SELECT assert(oob_error < 100.0, 'oob_error is larger than 100.0!') FROM train_output_group;
SELECT
assert(con_var_importance[1] > cat_var_importance[1], 'temperature should be important!'),
assert(con_var_importance[1] > cat_var_importance[2], 'temperature should be important!')
FROM train_output_group;
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group, train_output_poisson_count;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'temperature::double precision'::TEXT, -- response
'humidity'::TEXT, -- features
NULL::TEXT, -- exclude columns
'class,windy', -- grouping
5, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::integer, -- max depth
1::integer, -- min split
1::integer, -- min bucket
2::integer, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
----------------------------------------------------------------------------
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'response'
);
SELECT * from predict_output;
----------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class'::TEXT, -- response
'humidity, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
5, -- num of trees
1, -- num of random features
FALSE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::integer, -- max depth
1::integer, -- min split
1::integer, -- min bucket
8::integer, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'prob'
);
SELECT * from predict_output;
SELECT
assert(
"estimated_prob_Don't Play" + "estimated_prob_Play" = 1,
'probabilities do not add to 1')
FROM predict_output;
-------------------------------------------------------------------------
-- smaller bootstrap sample size
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'class'::TEXT, -- response
'humidity, temperature'::TEXT, -- features
NULL::TEXT, -- exclude columns
NULL::TEXT, -- no grouping
5, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::INTEGER, -- max depth
1::INTEGER, -- min split
1::INTEGER, -- min bucket
3::INTEGER, -- number of bins per continuous variable
'max_surrogates=0',
TRUE,
.5
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
\x off
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'prob'
);
SELECT * from predict_output;
SELECT get_tree('train_output', 1, sid)
from generate_series(1, 5) sid;
-------------------------------------------------------------------------
-- Test case for the case where a group has a filtered feature
DROP TABLE IF EXISTS rf_gr_test CASCADE;
CREATE TABLE rf_gr_test (
id integer NOT NULL,
gr integer,
f1 integer,
f2 integer, --filtered for gr = 2
f3 double precision,
cl integer
) ;
INSERT INTO rf_gr_test (id,gr,f1,f2,f3,cl) VALUES
(1,1,1,1,5,1),
(2,1,2,2,4,2),
(3,1,3,3,3,1),
(4,2,4,4,2,2),
(5,2,5,4,1,1);
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'rf_gr_test'::TEXT, -- source table
'train_output'::TEXT, -- output model table
'id'::TEXT, -- id column
'cl'::TEXT, -- response
'f1, f2, f3'::TEXT, -- features
NULL::TEXT, -- exclude columns
'gr'::TEXT, -- no grouping
2, -- num of trees
1, -- num of random features
TRUE::BOOLEAN, -- importance
1::INTEGER, -- num_permutations
10::INTEGER, -- max depth
1::INTEGER, -- min split
1::INTEGER, -- min bucket
2::INTEGER, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);