Hivemall Random Forest supports libsvm-like sparse inputs.
Note
This feature, i.e., sparse input support in Random Forest, is available in Hivemall v0.5.0 and later.
The `feature_hashing` function is useful for preparing feature vectors for Random Forest.
-- Train a Random Forest classifier on the `train` table and store the
-- output of train_randomforest_classifier in `rf_model`.
-- NOTE(review): convert_label is not defined in this section; it is assumed to
-- be a macro/UDF registered beforehand -- confirm before running.

-- IF EXISTS keeps the script re-runnable even when the table is absent.
DROP TABLE IF EXISTS rf_model;

CREATE TABLE rf_model AS
SELECT
  train_randomforest_classifier(
    features,
    convert_label(label),  -- convert -1/1 to 0/1
    '-trees 50 -seed 71'   -- hyperparameters
  )
FROM
  train;
Caution
label must be in `[0, k)`, where `k` is the number of classes.
-- Predict a label for each row of `test`: every (model, test row) pair is
-- scored with tree_predict, then the per-tree predictions are combined per
-- rowid with rf_ensemble and stored in `rf_predicted`.

-- Only needed for the pre-v0.5.0 tree_predict call shown further down:
-- SET hivevar:classification=true;

-- IF EXISTS keeps the script re-runnable even when the table is absent.
DROP TABLE IF EXISTS rf_predicted;

CREATE TABLE rf_predicted AS
SELECT
  rowid,
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) AS predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) AS predicted -- avoid OOB accuracy (i.e., model_weight)
FROM (
  SELECT
    t.rowid,
    m.model_weight,
    -- v0.5.0 and later
    tree_predict(m.model_id, m.model, t.features, "-classification") AS predicted
    -- before v0.5.0
    -- tree_predict(m.model_id, m.model, t.features, ${classification}) AS predicted
  FROM
    rf_model m
    -- No ON clause on purpose: each model is paired with each test row.
    LEFT OUTER JOIN -- CROSS JOIN
    test t
) t1
GROUP BY
  rowid
;
-- Accuracy of the predictions in `rf_predicted` against the labels in `test`.
-- NOTE(review): convert_label is not defined in this section; it is assumed to
-- be a macro/UDF registered beforehand -- confirm before running.
WITH submit AS (
  -- One row per test example that has a prediction: true label vs. predicted label.
  SELECT
    convert_label(t.label) AS actual,
    p.predicted.label AS predicted
  FROM
    test t
    JOIN rf_predicted p ON (t.rowid = p.rowid)
)
SELECT
  -- The denominator is the number of evaluated rows rather than the
  -- hard-coded 4996.0, so the query works for any test-set size. The two
  -- agree when `test` has 4996 rows and each one has a prediction.
  SUM(CASE WHEN actual = predicted THEN 1 ELSE 0 END) / COUNT(1)
FROM
  submit;
0.8112489991993594