docs/gitbook/binaryclass/news20

Hivemall Random Forest supports libsvm-like sparse inputs.

Note
Sparse input support in Random Forest is available in Hivemall v0.5.0 and later. The feature_hashing function is useful for preparing feature vectors for Random Forest.

Training

-- Train a Random Forest classifier on the `train` table and store the output
-- of train_randomforest_classifier in `rf_model`.
-- IF EXISTS keeps the script re-runnable even when `rf_model` has not been
-- created yet (a bare DROP TABLE can fail if Hive is configured not to
-- ignore non-existent tables).
drop table if exists rf_model;
create table rf_model
as
select
  train_randomforest_classifier(
    features,
    convert_label(label),  -- convert -1/1 to 0/1
    '-trees 50 -seed 71'   -- hyperparameters
  )
from
  train;

Caution
label must be in [0, k) where k is the number of classes.

Prediction

-- Predict a class for each row of `test`: tree_predict is evaluated for every
-- (model row, test row) pair, and rf_ensemble combines the per-tree outputs
-- for each rowid into the final `predicted` value.

-- The hivevar below is referenced only by the pre-v0.5.0 tree_predict variant
-- that is commented out further down.
-- SET hivevar:classification=true;

-- IF EXISTS keeps the script re-runnable even when `rf_predicted` has not
-- been created yet.
drop table if exists rf_predicted;
create table rf_predicted
as
select
  rowid,
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
from (
  select
    rowid,
    m.model_weight,
    -- v0.5.0 and later
    tree_predict(m.model_id, m.model, t.features, "-classification") as predicted
    -- before v0.5.0
    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted
  from
    rf_model m
    -- Intentionally no ON clause: the outer join is used as a CROSS JOIN so
    -- that every tree is applied to every test row.
    left outer join -- CROSS JOIN
    test t
) t1
group by
  rowid
;

Evaluation

-- Accuracy of the Random Forest predictions: the fraction of joined test rows
-- whose predicted label equals the converted ground-truth label.
with submit as (
  -- Pair each test row's true label (converted by convert_label) with the
  -- label stored in rf_predicted for the same rowid.
  select
    convert_label(t.label) as actual,
    p.predicted.label as predicted
  from
    test t
    join rf_predicted p on (t.rowid = p.rowid)
)
select
  -- The denominator is derived from the data rather than the hard-coded
  -- test-set size (4996.0), so the query stays correct for other datasets.
  -- NOTE(review): this equals the test-set size only when every test row has
  -- a row in rf_predicted; confirm if predictions can be missing.
  sum(if(actual = predicted, 1, 0)) / count(1) as accuracy
from
  submit;

0.8112489991993594