This example shows basic usage of RandomForest in Hivemall on the Kaggle Titanic dataset. It gives a baseline score without any feature engineering.

Data preparation

create database titanic;
use titanic;

drop table train;
create external table train (
  passengerid int, -- unique id
  survived int, -- target label
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int, -- Number of Siblings/Spouses Aboard
  parch int, -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
) 
ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '|'
   LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/titanic/train';

hadoop fs -rm /dataset/titanic/train/train.csv

# Convert the comma-separated train.csv into pipe-separated fields: FPAT keeps
# quoted fields intact, NR > 1 skips the CSV header, and the surrounding double
# quotes are stripped from the name column (the 4th field).
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$4=substr($4,2,length($4)-2);print $0}' train.csv | hadoop fs -put - /dataset/titanic/train/train.csv

drop table test_raw;
create external table test_raw (
  passengerid int,
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int, -- Number of Siblings/Spouses Aboard
  parch int, -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
)
ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '|'
   LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/titanic/test_raw';

hadoop fs -rm /dataset/titanic/test_raw/test.csv

# Same conversion for test.csv; name is the 3rd field here because
# test.csv has no survived column.
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$3=substr($3,2,length($3)-2);print $0}' test.csv | hadoop fs -put - /dataset/titanic/test_raw/test.csv

Data preparation for RandomForest

Hivemall's quantify UDTF maps categorical (string) values to consecutive integer ids so that every feature fits into a numeric array. Its boolean first argument controls whether a row is emitted, and rows must be fed in a deterministic order (hence the order by passengerid) for the ids to be assigned consistently.

set hivevar:output_row=true;

drop table train_rf;
create table train_rf
as
WITH train_quantified as (
  select    
    quantify(
      ${output_row}, passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    select * from train
    order by passengerid asc
  ) t
)
select
  rand(31) as rnd, -- seeded random number, used later to split the data
  passengerid, 
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features,
  survived
from
  train_quantified
;
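
If you want to confirm the effect of quantify, peek at a few rows; every string column now appears as an integer id inside the feature array (a quick check, not part of the original recipe):

select rnd, passengerid, features, survived
from train_rf
limit 3;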

drop table test_rf;
create table test_rf
as
WITH test_quantified as (
  select 
    quantify(
      output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    -- need training data to assign consistent ids to categorical variables
    select * from (
      select
        1 as train_first, false as output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        train
      union all
      select
        2 as train_first, true as output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        test_raw
    ) t0
    order by train_first asc, passengerid asc
  ) t1
)
select
  passengerid, 
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features
from
  test_quantified
;
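
As a sanity check, test_rf should contain exactly the 418 test passengers, each with the same 10-element feature layout as train_rf (size is a Hive built-in):

select size(features) as num_features, count(1) as num_rows
from test_rf
group by size(features);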

Training

select guess_attribute_types(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) from train limit 1;

Q,C,C,Q,Q,Q,C,Q,C,C

Q and C denote quantitative and categorical variables, respectively.

Caution

Note that the output of guess_attribute_types is not perfect. Revise it yourself; for example, pclass is guessed as Q but is really a categorical variable, so the first attribute is changed to C below.

set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C;

drop table model_rf;
create table model_rf
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}") 
from
  train_rf
;

select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf;

[137.00242639169272,1194.2140119834373,328.78017188176966,628.2568660509628,200.31275032394072,160.12876797647078,1083.5987543408116,664.1234312561456,422.89449844090393,130.72019667694784] 0.18742985409652077
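
The var_importance array follows the column order of the feature array. A minimal sketch, assuming Hive's posexplode, that ranks the features by importance (the column-name array is repeated by hand here):

select
  array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked')[f.pos] as feature,
  f.imp as importance
from (
  select array_sum(var_importance) as var_importance
  from model_rf
) t
LATERAL VIEW posexplode(t.var_importance) f as pos, imp
order by importance desc;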

Prediction

-- SET hivevar:classification=true;
set hive.auto.convert.join=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

drop table predicted_rf;
create table predicted_rf
as
SELECT 
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid, 
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
    FROM (
      SELECT 
        model_id, model_weight, model
      FROM 
        model_rf 
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf t
  ) t1
  group by
    passengerid
) t2
;

Caution

tree_predict_v1 is provided for backward compatibility: it applies prediction models built before v0.5.0 on v0.5.0 or later.

Kaggle submission

drop table predicted_rf_submit;
create table predicted_rf_submit
  ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY ","
    LINES TERMINATED BY "\n"
  STORED AS TEXTFILE
as
SELECT passengerid, label as survived
FROM predicted_rf
ORDER BY passengerid ASC;
hadoop fs -getmerge /user/hive/warehouse/titanic.db/predicted_rf_submit predicted_rf_submit.csv
sed -i -e "1i PassengerId,Survived" predicted_rf_submit.csv
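
Before uploading, it does no harm to check the distribution of the predicted labels (a quick sanity check, not part of the original recipe):

select survived, count(1) as cnt
from predicted_rf_submit
group by survived;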

This submission scores an accuracy of 0.76555 on Kaggle.

Graphviz export

Note

The tree_export feature is supported in Hivemall v0.5.0 or later. It is better to limit the tree depth at training time with the -depth option when you plan to plot a decision tree.

Hivemall provides tree_export to export a decision tree into Graphviz dot or human-readable JavaScript format. You can find the usage by issuing the following query:

> select tree_export("","-help");

usage: tree_export(string model, const string options, optional
       array<string> featureNames=null, optional array<string>
       classNames=null) - exports a Decision Tree model as javascript/dot]
       [-help] [-output_name <arg>] [-r] [-t <arg>]
 -help                             Show function help
 -output_name,--outputName <arg>   output name [default: predicted]
 -r,--regression                   Is regression tree or not
 -t,--type <arg>                   Type of output [default: js,
                                   javascript/js, graphviz/dot]

CREATE TABLE model_exported 
  STORED AS ORC tblproperties("orc.compress"="SNAPPY")
AS
select
  model_id,
  tree_export(model, "-type javascript -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as js,
  tree_export(model, "-type graphviz -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as dot
from
  model_rf
-- limit 1
;

You can then plot a decision tree using Graphviz or Vis.js. For instance, fetch one exported tree and paste it into a Graphviz viewer, as sketched below:
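
-- a minimal sketch: fetch one tree in dot format; the resulting string can be
-- rendered with any Graphviz viewer (e.g., dot -Tpng)
select dot from model_exported limit 1;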

Test by dividing the training dataset

drop table train_rf_07;
create table train_rf_07 
as
select * from train_rf 
where rnd < 0.7;

drop table test_rf_03;
create table test_rf_03
as
select * from train_rf
where rnd >= 0.7;
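
Since rnd was drawn uniformly in [0, 1) with a fixed seed (rand(31) at data-preparation time), the split is roughly 70/30; a quick check:

select
  sum(if(rnd < 0.7, 1, 0)) as train_rows,
  sum(if(rnd >= 0.7, 1, 0)) as test_rows
from train_rf;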

drop table model_rf_07;
create table model_rf_07
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}") 
from
  train_rf_07;

select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf_07;

[116.12055542977338,960.8569891444097,291.08765260103837,469.74671636586226,163.721292772701,120.784769882858,847.9769298113661,554.4617571355476,346.3500941757221,97.42593940113392] 0.1838351822503962

-- SET hivevar:classification=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

drop table predicted_rf_03;
create table predicted_rf_03
as
SELECT 
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid,
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
    FROM (
      SELECT 
        model_id, model_weight, model
      FROM 
        model_rf_07
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf_03 t
  ) t1
  group by
    passengerid
) t2;

WITH rf_submit_03 as (
  select 
    t.survived as actual, 
    p.label as predicted
  from 
    test_rf_03 t 
    JOIN predicted_rf_03 p on (t.passengerid = p.passengerid)
)
select sum(if(actual=predicted,1,0))/count(1) as accuracy 
from rf_submit_03;

0.8153846153846154
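
Beyond the single accuracy number, a confusion matrix shows where the model errs; a minimal sketch over the held-out 30% split:

select
  t.survived as actual,
  p.label as predicted,
  count(1) as cnt
from
  test_rf_03 t
  JOIN predicted_rf_03 p on (t.passengerid = p.passengerid)
group by
  t.survived, p.label;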

Tracing predictions

The following query finds the attributes and branch conditions that frequently appear on decision paths leading to a survived prediction.

WITH tmp as (
  SELECT
    t.survived as actual,
    decision_path(m.model_id, m.model, t.features, '-classification -no_verbose', array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked')) as path
  FROM
    model_rf_07 m
    LEFT OUTER JOIN -- CROSS JOIN
    test_rf_03 t
)
select
  r.branch,
  count(1) as cnt
from
  tmp l
  LATERAL VIEW explode(array_slice(path, 0, -1)) r as branch -- drop the last element, which is the leaf node's prediction
where
  -- actual = 1 and      -- actual is survived
  last_element(path) = 1 -- predicted is survived
group by
  r.branch
order by
  cnt desc
limit 100;
r.branch          cnt
sex != 0.0        29786
pclass != 3.0     18520
pclass = 3.0      7444
sex = 0.0         6494
embarked != 1.0   6175
ticket != 22.0    5560
......