This example shows a basic usage of RandomForest on Hivemall using the Kaggle Titanic dataset. The example gives a baseline score without any feature engineering.
-- Create the working database and define the raw Titanic training data
-- as a pipe-delimited external table on HDFS.
create database titanic;
use titanic;

drop table train;
create external table train (
  passengerid int,  -- unique id
  survived int,     -- target label
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int,        -- Number of Siblings/Spouses Aboard
  parch int,        -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/dataset/titanic/train';
# Remove any existing copy of the file from HDFS first.
hadoop fs -rm /dataset/titanic/train/train.csv

# Convert the CSV to pipe-delimited form (FPAT handles quoted fields that
# contain commas), skip the header row, strip the surrounding quotes from
# the name field ($4), and stream the result into HDFS.
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$4=substr($4,2,length($4)-2);print $0}' train.csv \
  | hadoop fs -put - /dataset/titanic/train/train.csv
-- Raw Kaggle test set: same layout as train but without the survived label.
drop table test_raw;
create external table test_raw (
  passengerid int,
  pclass int,
  name string,
  sex string,
  age int,
  sibsp int,        -- Number of Siblings/Spouses Aboard
  parch int,        -- Number of Parents/Children Aboard
  ticket string,
  fare double,
  cabin string,
  embarked string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/dataset/titanic/test_raw';
# Remove any existing copy of the file from HDFS first.
hadoop fs -rm /dataset/titanic/test_raw/test.csv

# Same CSV-to-pipe conversion as for train.csv; here the quoted name field
# is the third column ($3) because the test set has no survived column.
awk '{ FPAT="([^,]*)|(\"[^\"]+\")";OFS="|"; } NR >1 {$1=$1;$3=substr($3,2,length($3)-2);print $0}' test.csv \
  | hadoop fs -put - /dataset/titanic/test_raw/test.csv
set hivevar:output_row=true;

-- quantify() maps categorical string columns onto numeric ids so they can
-- be packed into the feature array consumed by the random forest trainer.
drop table train_rf;
create table train_rf
as
WITH train_quantified as (
  select
    quantify(
      ${output_row}, passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    select * from train
    order by passengerid asc
  ) t
)
select
  rand(31) as rnd,  -- fixed seed: reused later for a reproducible 70/30 split
  passengerid,
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features,
  survived
from
  train_quantified
;

drop table test_rf;
create table test_rf
as
WITH test_quantified as (
  select
    quantify(
      output_row, passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
    ) as (passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
  from (
    -- need training data to assign consistent ids to categorical variables
    select * from (
      select
        1 as train_first, false as output_row,
        passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        train
      union all
      select
        2 as train_first, true as output_row,
        passengerid, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked
      from
        test_raw
    ) t0
    order by train_first asc, passengerid asc
  ) t1
)
select
  passengerid,
  array(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) as features
from
  test_quantified
;
-- Ask Hivemall to guess, per column, whether it is quantitative (Q)
-- or categorical (C); the result seeds the -attrs training option.
select
  guess_attribute_types(pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
from
  train
limit 1;
Q,C,C,Q,Q,Q,C,Q,C,C
Q
and C
represent quantitative variable and categorical variables, respectively.
Caution
Note that the output of
guess_attribute_types
is not perfect. Revise it by yourself. For example, pclass
is a categorical variable.
-- Revised attribute types: the first entry is changed from Q to C
-- because pclass is categorical even though it is numeric.
set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C;

drop table model_rf;
create table model_rf
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}")
from
  train_rf
;

-- Aggregate variable importance and the out-of-bag error rate across trees.
select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf;
[137.00242639169272,1194.2140119834373,328.78017188176966,628.2568660509628,200.31275032394072,160.12876797647078,1083.5987543408116,664.1234312561456,422.89449844090393,130.72019667694784] 0.18742985409652077
-- SET hivevar:classification=true;
set hive.auto.convert.join=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

-- Score the test set: each tree votes via tree_predict, and rf_ensemble
-- merges the per-tree posteriors (weighted by OOB-based model_weight).
drop table predicted_rf;
create table predicted_rf
as
SELECT
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid,
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
      --   -- to use the old model in v0.5.0 or later
    FROM (
      -- spread the 500 trees across reducers for the cross join below
      SELECT model_id, model_weight, model
      FROM model_rf
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf t
  ) t1
  group by
    passengerid
) t2
;
Caution
tree_predict_v1
is for backward compatibility, for using prediction models built before v0.5.0
on v0.5.0
or later.
-- Materialize the submission file layout (PassengerId, Survived) as CSV.
drop table predicted_rf_submit;
create table predicted_rf_submit
  ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ","
    LINES TERMINATED BY "\n"
  STORED AS TEXTFILE
as
SELECT
  passengerid,
  label as survived
FROM
  predicted_rf
ORDER BY
  passengerid ASC;
# Merge the Hive table files into a single local CSV.
hadoop fs -getmerge /user/hive/warehouse/titanic.db/predicted_rf_submit predicted_rf_submit.csv

# Prepend the header row that Kaggle expects.
sed -i -e "1i PassengerId,Survived" predicted_rf_submit.csv
Accuracy would be 0.76555
for a Kaggle submission.
Note
tree_export
feature is supported from Hivemall v0.5.0 or later. Better to limit tree depth on training by the -depth
option to plot a Decision Tree.
Hivemall provides tree_export
to export a decision tree into Graphviz or human-readable Javascript format. You can find the usage by issuing the following query:
> select tree_export("","-help"); usage: tree_export(string model, const string options, optional array<string> featureNames=null, optional array<string> classNames=null) - exports a Decision Tree model as javascript/dot [-help] [-output_name <arg>] [-r] [-t <arg>] -help Show function help -output_name,--outputName <arg> output name [default: predicted] -r,--regression Is regression tree or not -t,--type <arg> Type of output [default: js, javascript/js, graphviz/dot]
-- Export every tree of the forest in both Javascript and Graphviz DOT
-- form, supplying feature names and class labels for readable output.
CREATE TABLE model_exported
  STORED AS ORC tblproperties("orc.compress"="SNAPPY")
AS
select
  model_id,
  tree_export(model, "-type javascript -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as js,
  tree_export(model, "-type graphviz -output_name survived", array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked'), array('no','yes')) as dot
from
  model_rf
-- limit 1
;
Here is an example plotting a decision tree using Graphviz or Vis.js.
-- Build a 70/30 train/validation split from the precomputed rnd column
-- (rand(31), so the split is reproducible), then retrain on the 70%.
drop table train_rf_07;
create table train_rf_07
as
select * from train_rf where rnd < 0.7;

drop table test_rf_03;
create table test_rf_03
as
select * from train_rf where rnd >= 0.7;

drop table model_rf_07;
create table model_rf_07
AS
select
  train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}")
from
  train_rf_07;

select
  array_sum(var_importance) as var_importance,
  sum(oob_errors) / sum(oob_tests) as oob_err_rate
from
  model_rf_07;
[116.12055542977338,960.8569891444097,291.08765260103837,469.74671636586226,163.721292772701,120.784769882858,847.9769298113661,554.4617571355476,346.3500941757221,97.42593940113392] 0.1838351822503962
-- SET hivevar:classification=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=16;

-- Score the 30% hold-out with the model trained on the other 70%.
drop table predicted_rf_03;
create table predicted_rf_03
as
SELECT
  passengerid,
  predicted.label,
  predicted.probability,
  predicted.probabilities
FROM (
  SELECT
    passengerid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
    -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
  FROM (
    SELECT
      t.passengerid,
      p.model_weight,
      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
      -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
      --   -- to use the old model in v0.5.0 or later
    FROM (
      SELECT model_id, model_weight, model
      FROM model_rf_07
      DISTRIBUTE BY rand(1)
    ) p
    LEFT OUTER JOIN test_rf_03 t
  ) t1
  group by
    passengerid
) t2;
-- Validation accuracy: fraction of hold-out rows whose predicted label
-- matches the actual survived label.
WITH rf_submit_03 as (
  select
    t.survived as actual,
    p.label as predicted
  from
    test_rf_03 t
    JOIN predicted_rf_03 p on (t.passengerid = p.passengerid)
)
select
  sum(if(actual=predicted,1,0))/count(1) as accuracy
from
  rf_submit_03;
0.8153846153846154
Find important attributes and conditions predicted to survive.
-- For hold-out rows predicted as survived, enumerate the branch
-- conditions on each tree's decision path and rank them by frequency.
WITH tmp as (
  SELECT
    t.survived as actual,
    decision_path(m.model_id, m.model, t.features, '-classification -no_verbose', array('pclass','name','sex','age','sibsp','parch','ticket','fare','cabin','embarked')) as path
  FROM
    model_rf_07 m
    LEFT OUTER JOIN -- CROSS JOIN
    test_rf_03 t
)
select
  r.branch,
  count(1) as cnt
from
  tmp l
  LATERAL VIEW explode(array_slice(path, 0, -1)) r as branch
where
  -- actual = 1 and -- actual is survived
  last_element(path) = 1 -- predicted is survived
group by
  r.branch
order by
  cnt desc
limit 100;
r.branch | cnt |
---|---|
sex != 0.0 | 29786 |
pclass != 3.0 | 18520 |
pclass = 3.0 | 7444 |
sex = 0.0 | 6494 |
embarked != 1.0 | 6175 |
ticket != 22.0 | 5560 |
... | ... |