Attribute Information: 1. sepal length in cm 2. sepal width in cm 3. petal length in cm 4. petal width in cm 5. class: -- Iris Setosa -- Iris Versicolour -- Iris Virginica
-- Create a dedicated database and expose the raw Iris CSV as an external table.
CREATE DATABASE iris;
USE iris;

-- The four measurements are decimal values in cm, so they are declared DOUBLE.
-- Declaring them INT would not hold the fractional part of the measurements
-- (the tree exported later in this document splits on values such as 1.65 and 1.75).
-- NOTE: `petak_width` is a misspelling of "petal_width"; the name is kept
-- unchanged because later statements in this document reference it.
CREATE EXTERNAL TABLE raw (
  sepal_length DOUBLE,
  sepal_width  DOUBLE,
  petal_length DOUBLE,
  petak_width  DOUBLE,
  class        STRING
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/dataset/iris/raw';

-- Shell command (run outside Hive): drop empty lines from iris.data and
-- upload the result into the table's LOCATION directory.
$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
-- Build a lookup table from class name to a zero-based integer label.
-- DENSE_RANK() numbers the distinct class names starting at 1 (ordered by
-- class); subtracting 1 makes the labels start from 0.
CREATE TABLE label_mapping AS
SELECT
  class,
  rank - 1 AS label
FROM (
  SELECT DISTINCT
    class,
    DENSE_RANK() OVER (ORDER BY class) AS rank
  FROM raw
) ranked_classes
;
-- Assemble the training set: one row per raw sample with
--   rowid    : value returned by the rowid() UDF (defined outside this file),
--   features : the four measurements packed into an array, in column order,
--   label    : the integer label looked up from label_mapping by class name.
CREATE TABLE training AS
SELECT
  rowid() AS rowid,
  array(
    r.sepal_length,
    r.sepal_width,
    r.petal_length,
    r.petak_width
  ) AS features,
  m.label
FROM raw r
INNER JOIN label_mapping m
  ON (r.class = m.class)
;
train_randomforest_classifier
takes a dense feature vector
as double[] and an integer label
starting from 0.
-- Train a RandomForest classifier on the `training` table and store the
-- rows emitted by train_randomforest_classifier as the `model` table.
-- Each `--` comment is kept on its own line so that it cannot comment out
-- the remainder of the statement (in particular the FROM clause).
CREATE TABLE model
STORED AS SEQUENCEFILE
AS
select
  train_randomforest_classifier(features, label)
  -- v0.5.0 and later
  -- train_randomforest_classifier(features, label) as (model_id, model_weight, model, var_importance, oob_errors, oob_tests)
  -- v0.4.1-alpha.2 and before
  -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests)
  -- from v0.4.1 to v0.4.2-rc4
  -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
from
  training;
Caution
Note that the model storage format differs between versions, as shown above.
hive> desc extended model;
col_name | data_type |
---|---|
model_id | string |
model_weight | double |
model | string |
var_importance | array<double> |
oob_errors | int |
oob_tests | int |
-help
option shows usage of the function.
-- Passing "-help" as the options string makes the call fail with a UDFArgumentException whose message is the function's usage text; the captured output follows the `>` marker.
select train_randomforest_classifier(features, label, "-help") from training; > FAILED: UDFArgumentException usage: train_randomforest_classifier(array<double|string> features, int label [, const array<double> classWeights, const string options]) - Returns a relation consists of <int model_id, int model_type, string pred_model, array<double> var_importance, int oob_errors, int oob_tests, double weight> [-attrs <arg>] [-depth <arg>] [-help] [-leafs <arg>] [-min_samples_leaf <arg>] [-rule <arg>] [-seed <arg>] [-splits <arg>] [-stratified] [-subsample <arg>] [-trees <arg>] [-vars <arg>] -attrs,--attribute_types <arg> Comma separated attribute types (Q for quantitative variable and C for categorical variable. e.g., [Q,C,Q,C]) -depth,--max_depth <arg> The maximum number of the tree depth [default: Integer.MAX_VALUE] -help Show function help -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes [default: Integer.MAX_VALUE] -min_samples_leaf <arg> The minimum number of samples in a leaf node [default: 1] -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] -seed <arg> seed value in long [default: -1 (random)] -splits,--min_split <arg> A node that has greater than or equals to `min_split` examples will split [default: 2] -stratified,--stratified_sampling Enable Stratified sampling for unbalanced data -subsample <arg> Sampling rate in range (0.0,1.0] -trees,--num_trees <arg> The number of trees for each task [default: 50] -vars,--num_variables <arg> The number of random selected features [default: ceil(sqrt(x[0].length))]. int(num_variables * x[0].length) is considered if num_variable is (0,1]
Caution
-trees (--num_trees)
controls the number of trees for each task, not the total number of trees.
To parallelize RandomForest training, you can use UNION ALL as follows:
-- Train two forests of 25 trees each and concatenate their model rows with
-- UNION ALL, so the two training calls are separate branches of the query.
-- The alternative storage clause is kept as a comment on its own line so it
-- cannot comment out the `AS select ...` part of the statement.
CREATE TABLE model
STORED AS ORC tblproperties("orc.compress"="SNAPPY")
-- STORED AS SEQUENCEFILE
AS
select
  train_randomforest_classifier(features, label, '-trees 25')
from
  training
UNION ALL
select
  train_randomforest_classifier(features, label, '-trees 25')
from
  training
;
Variable importance
and Out Of Bag (OOB) error rate
of RandomForest can be shown as follows:
-- Summarise the whole forest stored in `model`:
--   var_importance : array_sum() of the per-row var_importance arrays,
--   oob_err_rate   : total OOB errors divided by total OOB tests.
SELECT
  array_sum(var_importance)          AS var_importance,
  SUM(oob_errors) / SUM(oob_tests)   AS oob_err_rate
FROM
  model
;
[6.837674865013268,4.1317115752776665,24.331571871930226,25.677497925673062] 0.056666666666666664
-- Predict a class for every training row and store the result as `predicted`.
-- Each `--` comment sits on its own line so that it cannot comment out the
-- statements and clauses that follow it.

-- set hivevar:classification=true;
set hive.auto.convert.join=true;
set hive.mapjoin.optimized.hashtable=false;

create table predicted
as
SELECT
  rowid,
  -- rf_ensemble(predicted) as predicted
  -- v0.5.0 or later
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
FROM (
  -- One row per (model row, training row) pair carrying that model's prediction.
  SELECT
    -- qualified with `t.` to match the parallel variant of this query
    t.rowid,
    -- from v0.4.1 to v0.4.2-rc4
    -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
    -- v0.5.0 or later
    p.model_weight,
    tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
    -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
  FROM
    model p
    -- No join condition: every model row is paired with every training row.
    LEFT OUTER JOIN -- CROSS JOIN
    training t
) t1
-- rf_ensemble is evaluated once per rowid over all model rows.
group by
  rowid
;
Note
Left outer join without a join condition (i.e.,
model p LEFT OUTER JOIN training t
) is a trick to fix the left table for cross join.
Caution
tree_predict_v1
is for backward compatibility: it allows prediction models built before v0.5
to be used on v0.5
or later.
The following query runs predictions with N-way parallelism. It can reduce the elapsed time for prediction by a factor of almost N.
-- Parallel variant of the prediction query: the model rows are redistributed
-- with DISTRIBUTE BY rand(1) before the join, and the number of reduce tasks
-- is set to 8.
-- Each `--` comment sits on its own line so that it cannot comment out the
-- clauses that follow it.
SET hivevar:classification=true;
set hive.auto.convert.join=true;
SET hive.mapjoin.optimized.hashtable=false;
SET mapred.reduce.tasks=8;

drop table predicted;
create table predicted
as
SELECT
  rowid,
  -- rf_ensemble(predicted) as predicted
  -- v0.5.0 or later
  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
FROM (
  -- One row per (model row, training row) pair carrying that model's prediction.
  SELECT
    t.rowid,
    -- from v0.4.1 to v0.4.2-rc4
    -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
    -- v0.5.0 or later
    p.model_weight,
    tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
    -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
  FROM (
    SELECT
      -- from v0.4.1 to v0.4.2-rc4
      -- model_id, model_type, pred_model
      -- v0.5.0 or later
      model_id, model_weight, model
    FROM
      model
    DISTRIBUTE BY rand(1)
  ) p
  -- No join condition: every model row is paired with every training row.
  LEFT OUTER JOIN training t
) t1
group by
  rowid;
-- Number of rows in the training set (used as the accuracy denominator).
SELECT
  COUNT(1)
FROM
  training
;
150
-- Accuracy = (rows whose predicted label equals the actual label) / total_cnt.
-- total_cnt is supplied as a Hive variable and must match the training row count.
SET hivevar:total_cnt=150;

WITH t1 AS (
  -- Pair every prediction with the true label of the same rowid.
  SELECT
    t.rowid,
    t.label           AS actual,
    p.predicted.label AS predicted
  FROM
    predicted p
    LEFT OUTER JOIN training t
      ON (t.rowid = p.rowid)
)
SELECT
  COUNT(1) / ${total_cnt}
FROM
  t1
WHERE
  actual = predicted
;
0.98
Note
tree_export
is supported in Hivemall v0.5.0 or later. When you intend to plot a decision tree, it is better to limit the tree depth at training time with the -depth
option.
Hivemall provides tree_export
to export a decision tree into Graphviz or human-readable JavaScript format. You can find the usage by issuing the following query:
-- Calling tree_export with "-help" as the options string prints the function's usage text; the captured output follows the query on the same line and is reproduced verbatim.
> select tree_export("","-help"); usage: tree_export(string model, const string options, optional array<string> featureNames=null, optional array<string> classNames=null) - exports a Decision Tree model as javascript/dot] [-help] [-output_name <arg>] [-r] [-t <arg>] -help Show function help -output_name,--outputName <arg> output name [default: predicted] -r,--regression Is regression tree or not -t,--type <arg> Type of output [default: js, javascript/js, graphviz/dot
-- Export every tree in `model` twice: as JavaScript (`js`) and as Graphviz DOT (`dot`).
-- The third argument names the features in the same order as the training
-- feature array; the fourth names the classes.
-- `petak_width` matches the (misspelled) column name of the raw table.
-- The optional `limit 1` is kept as a comment on its own line so that it
-- cannot comment out the terminating `;`.
CREATE TABLE model_exported
STORED AS ORC tblproperties("orc.compress"="SNAPPY")
AS
select
  model_id,
  tree_export(model, "-type javascript", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as js,
  tree_export(model, "-type graphviz", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as dot
from
  model
-- limit 1
;
// One exported decision tree in Graphviz DOT form.
// Box nodes are split conditions; ellipse nodes are predicted classes.
// Statement order is unchanged from the exported text.
digraph Tree {
  node [shape=box, style="filled, rounded", color="black", fontname=helvetica];
  edge [fontname=helvetica];

  // Root split and its two outgoing edges (labelled True / False).
  0 [label=<petal_length ≤ 2.599999964237213>, fillcolor="#00000000"];
  1 [label=<predicted = Setosa>, fillcolor="0.0000,1.000,1.000", shape=ellipse];
  0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"];
  2 [label=<petal_length ≤ 4.950000047683716>, fillcolor="#00000000"];
  0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"];

  // Subtree under node 3.
  3 [label=<petak_width ≤ 1.6500000357627869>, fillcolor="#00000000"];
  2 -> 3;
  4 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  3 -> 4;
  5 [label=<sepal_width ≤ 3.100000023841858>, fillcolor="#00000000"];
  3 -> 5;
  6 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  5 -> 6;
  7 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  5 -> 7;

  // Subtree under node 8.
  8 [label=<petak_width ≤ 1.75>, fillcolor="#00000000"];
  2 -> 8;
  9 [label=<petal_length ≤ 5.299999952316284>, fillcolor="#00000000"];
  8 -> 9;
  10 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  9 -> 10;
  11 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  9 -> 11;
  12 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  8 -> 12;
}
You can draw the graph by running the command: dot -Tpng iris.dot -o iris.png
or by using Viz.js.