blob: 5dfd318caed6c8bcaa504544f28a6a6b6fc8a11f [file] [log] [blame]
DROP TABLE IF EXISTS dt_golf CASCADE;
CREATE TABLE dt_golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Cont_features" double precision[],
cat_features text[],
windy boolean,
class text
) ;
INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class) VALUES
(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
-------------------------------------------------------------------------
-- classification without grouping
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id' , -- id column
'class', -- response
'windy, "Cont_features"[1]', -- features
NULL, -- exclude columns
NULL, -- no grouping
5, -- num of trees
2, -- num of random features
TRUE, -- importance
1, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
8, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT count(*) FROM train_output;
SELECT * from train_output_group;
-- classification with grouping
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group, train_output_poisson_count;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'temperature::double precision', -- response
'cat_features, windy, "Cont_features"', -- features
NULL, -- exclude columns
'class', -- grouping
5, -- num of trees
5, -- num of random features
TRUE, -- importance
20, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
3, -- number of bins per continuous variable
'max_surrogates = 2 ',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
-- regression with grouping
-------------------------------------------------------------------------
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group, train_output_poisson_count;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'temperature::double precision', -- response
'humidity', -- features
NULL, -- exclude columns
'class', -- grouping
5, -- num of trees
1, -- num of random features
TRUE, -- importance
1, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
2, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
SELECT assert(count(distinct(gid)) = 2, 'gid is not correctly generated!') FROM train_output;
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'response'
);
SELECT * from predict_output;
-------------------------------------------------------------------------
-- classification - using y to predict y for the sake of testing variable importance
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class::TEXT', -- response
'class, windy, temperature', -- features
NULL, -- exclude columns
NULL, -- no grouping
10, -- num of trees
1, -- num of random features
TRUE, -- importance
3, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
8, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
-- Get variable and impurity importance scores using helper function.
DROP TABLE IF EXISTS var_output;
SELECT get_var_importance('train_output', 'var_output');
-- Test for impurity importance reporting.
SELECT * FROM var_output;
-------------------------------------------------------------------------
-- regression - using y to predict y for the sake of testing variable importance
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'temperature::double precision', -- response
'class, temperature, windy', -- features
NULL, -- exclude columns
NULL, -- no grouping
10, -- num of trees
1, -- num of random features
TRUE, -- importance
3, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
8, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
SELECT assert(oob_error < 100.0, 'oob_error is larger than 100.0!') FROM train_output_group;
-- SELECT
-- assert(con_var_importance[1] > cat_var_importance[1], 'temperature should be important!'),
-- assert(con_var_importance[1] > cat_var_importance[2], 'temperature should be important!')
-- FROM train_output_group;
----------------------------------------------------------------------------
-- classification without grouping and set importance as FALSE
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'humidity, temperature', -- features
NULL, -- exclude columns
NULL, -- no grouping
5, -- num of trees
1, -- num of random features
FALSE, -- importance
1, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
8, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'prob'
);
SELECT * from predict_output;
SELECT
assert(
"estimated_prob_Don't Play" + "estimated_prob_Play" = 1,
'probabilities do not add to 1')
FROM predict_output;
-------------------------------------------------------------------------
-- smaller bootstrap sample size
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"Cont_features", humidity, temperature', -- features
NULL, -- exclude columns
NULL, -- no grouping
5, -- num of trees
NULL, -- num of random features
TRUE, -- importance
1, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
3, -- number of bins per continuous variable
'max_surrogates=0',
FALSE,
.5
);
\x on
SELECT * from train_output_summary;
SELECT * from train_output_group;
\x off
DROP TABLE IF EXISTS predict_output;
SELECT forest_predict(
'train_output',
'dt_golf',
'predict_output',
'prob'
);
SELECT * from predict_output;
SELECT get_tree('train_output', 1, sid)
from generate_series(1, 5) sid;
-------------------------------------------------------------------------
-- Test case for the case where a group has a filtered feature
-- Also testing var_importance=TRUE with no continuous features (MADLIB-1219)
DROP TABLE IF EXISTS rf_gr_test CASCADE;
CREATE TABLE rf_gr_test (
id integer NOT NULL,
gr integer,
f1 integer,
f2 integer, --filtered for gr = 2
f3 double precision,
cl integer
);
INSERT INTO rf_gr_test (id,gr,f1,f2,f3,cl) VALUES
(1,1,1,1,5,1),
(2,1,2,2,4,2),
(3,1,3,3,3,1),
(4,2,4,4,2,2),
(5,2,5,4,1,1),
(6,1,1,1,5,1),
(7,1,2,2,4,2),
(8,1,3,3,3,1),
(9,2,4,4,2,2),
(10,2,5,4,1,1),
(11,1,1,1,5,1),
(12,1,2,2,4,2),
(13,1,3,3,3,1),
(14,2,4,4,2,2),
(15,2,5,4,1,1);
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
SELECT forest_train(
'rf_gr_test', -- source table
'train_output', -- output model table
'id', -- id column
'cl', -- response
'f1, f2', -- features
NULL, -- exclude columns
'gr', -- grouping
2, -- num of trees
1, -- num of random features
TRUE, -- importance
1, -- num_permutations
10, -- max depth
1, -- min split
1, -- min bucket
2, -- number of bins per continuous variable
'max_surrogates=0',
FALSE
);
-- Get variable and impurity importance scores using helper function.
DROP TABLE IF EXISTS var_output;
SELECT get_var_importance('train_output', 'var_output');
-- Test for impurity importance reporting.
SELECT * FROM var_output;
SELECT assert(relative_error(
normalize_sum_array(ARRAY[40, 10]::double precision[], 100::double precision), ARRAY[80, 20]) < 1e-6,
'Array not normalizing');