docs/gitbook/multiclass/iris_randomforest.md - incubator-hivemall - Git at Google

 <!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
 -->

 <!-- toc -->

 # Dataset

 * https://archive.ics.uci.edu/ml/datasets/Iris

 ```
 Attribute Information:
    1. sepal length in cm
    2. sepal width in cm
    3. petal length in cm
    4. petal width in cm
    5. class:
       -- Iris Setosa
       -- Iris Versicolour
       -- Iris Virginica
 ```

 # Table preparation

 ```sql
 create database iris;
 use iris;

 create external table raw (
   sepal_length int,
   sepal_width int,
   petal_length int,
   petak_width int,
   class string
 )
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY ','
   LINES TERMINATED BY '\n'
 STORED AS TEXTFILE LOCATION '/dataset/iris/raw';

 $ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
 ```

 ```sql
 create table label_mapping
 as
 select
   class,
   rank - 1 as label
 from (
 select
   distinct class,
   dense_rank() over (order by class) as rank
 from
   raw
 ) t
 ;
 ```

 ```sql
 create table training
 as
 select
   rowid() as rowid,
   array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features,
   t2.label
 from
   raw t1
   JOIN label_mapping t2 ON (t1.class = t2.class)
 ;
 ```

 # Training

 `train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0.

 ```sql
 CREATE TABLE model
   STORED AS SEQUENCEFILE
 AS
 select
   train_randomforest_classifier(features, label)
   -- hivemall v0.4.1-alpha.2 and before
   -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests)
   -- hivemall v0.4.1 and later
   -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
 from
   training;
 ```

 > #### Caution
 > The default `TEXTFILE` should not be used for model table when using Javascript output through `-output javascript` option.

 ```sql
 hive> desc extended model;
 ```

 | col_name | data_type
 |:-:|:-:|
 | model_id               | string |
 | model_weight           | double |
 | model                  | string |
 | var_importance         | array<double> |
 | oob_errors             | int    |
 | oob_tests              | int    |


 ## Training options

 `-help` option shows usage of the function.

 ```sql
 select train_randomforest_classifier(features, label, "-help") from training;

 > FAILED: UDFArgumentException
 usage: train_randomforest_classifier(array<double|string> features, int
        label [, const array<double> classWeights, const string options]) -
        Returns a relation consists of <int model_id, int model_type,
        string pred_model, array<double> var_importance, int oob_errors,
        int oob_tests, double weight> [-attrs <arg>] [-depth <arg>] [-help]
        [-leafs <arg>] [-min_samples_leaf <arg>] [-rule <arg>] [-seed
        <arg>] [-splits <arg>] [-stratified] [-subsample <arg>] [-trees
        <arg>] [-vars <arg>]
  -attrs,--attribute_types <arg>      Comma separated attribute types (Q
                                      for quantitative variable and C for
                                      categorical variable. e.g.,
                                      [Q,C,Q,C])
  -depth,--max_depth <arg>            The maximum number of the tree depth
                                      [default: Integer.MAX_VALUE]
  -help                               Show function help
  -leafs,--max_leaf_nodes <arg>       The maximum number of leaf nodes
                                      [default: Integer.MAX_VALUE]
  -min_samples_leaf <arg>             The minimum number of samples in a
                                      leaf node [default: 1]
  -rule,--split_rule <arg>            Split algorithm [default: GINI,
                                      ENTROPY]
  -seed <arg>                         seed value in long [default: -1
                                      (random)]
  -splits,--min_split <arg>           A node that has greater than or
                                      equals to `min_split` examples will
                                      split [default: 2]
  -stratified,--stratified_sampling   Enable Stratified sampling for
                                      unbalanced data
  -subsample <arg>                    Sampling rate in range (0.0,1.0]
  -trees,--num_trees <arg>            The number of trees for each task
                                      [default: 50]
  -vars,--num_variables <arg>         The number of random selected
                                      features [default:
                                      ceil(sqrt(x[0].length))].
                                      int(num_variables * x[0].length) is
                                      considered if num_variable is (0,1
 ```

 > #### Caution
 > `-num_trees` controls the number of trees for each task, not the total number of trees.

 ### Parallelize Training

 To parallelize RandomForest training, you can use UNION ALL as follows:

 ```sql
 CREATE TABLE model
   STORED AS ORC tblproperties("orc.compress"="SNAPPY")
   -- STORED AS SEQUENCEFILE
 AS
 select
   train_randomforest_classifier(features, label, '-trees 25')
 from
   training
 UNION ALL
 select
   train_randomforest_classifier(features, label, '-trees 25')
 from
   training
 ;
 ```

 ### Learning stats

 [`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows:

 ```sql
 select
   array_sum(var_importance) as var_importance,
   sum(oob_errors) / sum(oob_tests) as oob_err_rate
 from
   model;
 ```
 > [6.837674865013268,4.1317115752776665,24.331571871930226,25.677497925673062]    0.056666666666666664

 # Prediction

 ```sql
 -- set hivevar:classification=true;
 set hive.auto.convert.join=true;
 set hive.mapjoin.optimized.hashtable=false;

 create table predicted
 as
 SELECT
   rowid,
   -- rf_ensemble(predicted) as predicted
   -- hivemall v0.5-rc.1 or later
   rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
   -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
 FROM (
   SELECT
     rowid,
     -- hivemall v0.4.1 and later
     -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
     -- hivemall v0.5-rc.1 or later
     p.model_weight,
     tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
     -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
     -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
   FROM
     model p
     LEFT OUTER JOIN -- CROSS JOIN
     training t
 ) t1
 group by
   rowid
 ;
 ```

 > #### Caution
 > `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later.

 ### Parallelize Prediction

 The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N.

 ```sql
 SET hivevar:classification=true;
 set hive.auto.convert.join=true;
 SET hive.mapjoin.optimized.hashtable=false;
 SET mapred.reduce.tasks=8;

 create table predicted
 as
 SELECT
   rowid,
   -- rf_ensemble(predicted) as predicted
   -- hivemall v0.5-rc.1 or later
   rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
   -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
 FROM (
   SELECT
     t.rowid,
     -- hivemall v0.4.1 and later
     -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
     -- hivemall v0.5-rc.1 or later
     p.model_weight,
     tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
     -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
     -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5-rc.1 or later
   FROM (
     SELECT
       -- hivemall v0.4.1 and later
       -- model_id, model_type, pred_model
       -- hivemall v0.5-rc.1 or later
       model_id, model_weight, model
     FROM model
     DISTRIBUTE BY rand(1)
   ) p
   LEFT OUTER JOIN training t
 ) t1
 group by
   rowid;
 ```

 # Evaluation

 ```sql
 select count(1) from training;
 ```
 > 150

 ```sql
 set hivevar:total_cnt=150;

 WITH t1 as (
 SELECT
   t.rowid,
   t.label as actual,
   p.predicted.label as predicted
 FROM
   predicted p
   LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
 )
 SELECT
   count(1) / ${total_cnt}
 FROM
   t1
 WHERE
   actual = predicted
 ;
 ```
 > 0.98

 # Graphvis export

 > #### Note
 > `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
 > Better to limit tree depth on training by `-depth` option to plot a Decision Tree.

 Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query:

 ```
 > select tree_export("","-help");

 usage: tree_export(string model, const string options, optional
        array<string> featureNames=null, optional array<string>
        classNames=null) - exports a Decision Tree model as javascript/dot]
        [-help] [-output_name <arg>] [-r] [-t <arg>]
  -help                             Show function help
  -output_name,--outputName <arg>   output name [default: predicted]
  -r,--regression                   Is regression tree or not
  -t,--type <arg>                   Type of output [default: js,
                                    javascript/js, graphvis/dot
 ```

 ```sql
 CREATE TABLE model_exported
   STORED AS ORC tblproperties("orc.compress"="SNAPPY")
 AS
 select
   model_id,
   tree_export(model, "-type javascript", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as js,
   tree_export(model, "-type graphvis", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as dot
 from
   model
 -- limit 1
 ;
 ```

 ```
 digraph Tree {
  node [shape=box, style="filled, rounded", color="black", fontname=helvetica];
  edge [fontname=helvetica];
  0 [label=<petal_length &le; 2.599999964237213>, fillcolor="#00000000"];
  1 [label=<predicted = Setosa>, fillcolor="0.0000,1.000,1.000", shape=ellipse];
  0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"];
  2 [label=<petal_length &le; 4.950000047683716>, fillcolor="#00000000"];
  0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"];
  3 [label=<petak_width &le; 1.6500000357627869>, fillcolor="#00000000"];
  2 -> 3;
  4 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  3 -> 4;
  5 [label=<sepal_width &le; 3.100000023841858>, fillcolor="#00000000"];
  3 -> 5;
  6 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  5 -> 6;
  7 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  5 -> 7;
  8 [label=<petak_width &le; 1.75>, fillcolor="#00000000"];
  2 -> 8;
  9 [label=<petal_length &le; 5.299999952316284>, fillcolor="#00000000"];
  8 -> 9;
  10 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
  9 -> 10;
  11 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  9 -> 11;
  12 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
  8 -> 12;
 }
 ```

 <img src="../resources/images/iris.png" alt="Iris Graphvis output"/>

 You can draw a graph by `dot -Tpng iris.dot -o iris.png` or using [Viz.js](http://viz-js.com/).
	<!--
	Licensed to the Apache Software Foundation (ASF) under one
	or more contributor license agreements. See the NOTICE file
	distributed with this work for additional information
	regarding copyright ownership. The ASF licenses this file
	to you under the Apache License, Version 2.0 (the
	"License"); you may not use this file except in compliance
	with the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing,
	software distributed under the License is distributed on an
	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	KIND, either express or implied. See the License for the
	specific language governing permissions and limitations
	under the License.
	-->

	<!-- toc -->

	# Dataset

	* https://archive.ics.uci.edu/ml/datasets/Iris

	```
	Attribute Information:
	1. sepal length in cm
	2. sepal width in cm
	3. petal length in cm
	4. petal width in cm
	5. class:
	-- Iris Setosa
	-- Iris Versicolour
	-- Iris Virginica
	```

	# Table preparation

	```sql
	create database iris;
	use iris;

	create external table raw (
	sepal_length int,
	sepal_width int,
	petal_length int,
	petak_width int,
	class string
	)
	ROW FORMAT DELIMITED
	FIELDS TERMINATED BY ','
	LINES TERMINATED BY '\n'
	STORED AS TEXTFILE LOCATION '/dataset/iris/raw';

	$ sed '/^$/d' iris.data \| hadoop fs -put - /dataset/iris/raw/iris.data
	```

	```sql
	create table label_mapping
	as
	select
	class,
	rank - 1 as label
	from (
	select
	distinct class,
	dense_rank() over (order by class) as rank
	from
	raw
	) t
	;
	```

	```sql
	create table training
	as
	select
	rowid() as rowid,
	array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features,
	t2.label
	from
	raw t1
	JOIN label_mapping t2 ON (t1.class = t2.class)
	;
	```

	# Training

	`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0.

	```sql
	CREATE TABLE model
	STORED AS SEQUENCEFILE
	AS
	select
	train_randomforest_classifier(features, label)
	-- hivemall v0.4.1-alpha.2 and before
	-- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests)
	-- hivemall v0.4.1 and later
	-- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
	from
	training;
	```

	> #### Caution
	> The default `TEXTFILE` should not be used for model table when using Javascript output through `-output javascript` option.

	```sql
	hive> desc extended model;
	```

	\| col_name \| data_type
	\|:-:\|:-:\|
	\| model_id \| string \|
	\| model_weight \| double \|
	\| model \| string \|
	\| var_importance \| array<double> \|
	\| oob_errors \| int \|
	\| oob_tests \| int \|


	## Training options

	`-help` option shows usage of the function.

	```sql
	select train_randomforest_classifier(features, label, "-help") from training;

	> FAILED: UDFArgumentException
	usage: train_randomforest_classifier(array<double\|string> features, int
	label [, const array<double> classWeights, const string options]) -
	Returns a relation consists of <int model_id, int model_type,
	string pred_model, array<double> var_importance, int oob_errors,
	int oob_tests, double weight> [-attrs <arg>] [-depth <arg>] [-help]
	[-leafs <arg>] [-min_samples_leaf <arg>] [-rule <arg>] [-seed
	<arg>] [-splits <arg>] [-stratified] [-subsample <arg>] [-trees
	<arg>] [-vars <arg>]
	-attrs,--attribute_types <arg> Comma separated attribute types (Q
	for quantitative variable and C for
	categorical variable. e.g.,
	[Q,C,Q,C])
	-depth,--max_depth <arg> The maximum number of the tree depth
	[default: Integer.MAX_VALUE]
	-help Show function help
	-leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes
	[default: Integer.MAX_VALUE]
	-min_samples_leaf <arg> The minimum number of samples in a
	leaf node [default: 1]
	-rule,--split_rule <arg> Split algorithm [default: GINI,
	ENTROPY]
	-seed <arg> seed value in long [default: -1
	(random)]
	-splits,--min_split <arg> A node that has greater than or
	equals to `min_split` examples will
	split [default: 2]
	-stratified,--stratified_sampling Enable Stratified sampling for
	unbalanced data
	-subsample <arg> Sampling rate in range (0.0,1.0]
	-trees,--num_trees <arg> The number of trees for each task
	[default: 50]
	-vars,--num_variables <arg> The number of random selected
	features [default:
	ceil(sqrt(x[0].length))].
	int(num_variables * x[0].length) is
	considered if num_variable is (0,1
	```

	> #### Caution
	> `-num_trees` controls the number of trees for each task, not the total number of trees.

	### Parallelize Training

	To parallelize RandomForest training, you can use UNION ALL as follows:

	```sql
	CREATE TABLE model
	STORED AS ORC tblproperties("orc.compress"="SNAPPY")
	-- STORED AS SEQUENCEFILE
	AS
	select
	train_randomforest_classifier(features, label, '-trees 25')
	from
	training
	UNION ALL
	select
	train_randomforest_classifier(features, label, '-trees 25')
	from
	training
	;
	```

	### Learning stats

	[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows:

	```sql
	select
	array_sum(var_importance) as var_importance,
	sum(oob_errors) / sum(oob_tests) as oob_err_rate
	from
	model;
	```
	> [6.837674865013268,4.1317115752776665,24.331571871930226,25.677497925673062] 0.056666666666666664

	# Prediction

	```sql
	-- set hivevar:classification=true;
	set hive.auto.convert.join=true;
	set hive.mapjoin.optimized.hashtable=false;

	create table predicted
	as
	SELECT
	rowid,
	-- rf_ensemble(predicted) as predicted
	-- hivemall v0.5-rc.1 or later
	rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
	-- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
	FROM (
	SELECT
	rowid,
	-- hivemall v0.4.1 and later
	-- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
	-- hivemall v0.5-rc.1 or later
	p.model_weight,
	tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
	-- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
	-- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
	FROM
	model p
	LEFT OUTER JOIN -- CROSS JOIN
	training t
	) t1
	group by
	rowid
	;
	```

	> #### Caution
	> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later.

	### Parallelize Prediction

	The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N.

	```sql
	SET hivevar:classification=true;
	set hive.auto.convert.join=true;
	SET hive.mapjoin.optimized.hashtable=false;
	SET mapred.reduce.tasks=8;

	create table predicted
	as
	SELECT
	rowid,
	-- rf_ensemble(predicted) as predicted
	-- hivemall v0.5-rc.1 or later
	rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
	-- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
	FROM (
	SELECT
	t.rowid,
	-- hivemall v0.4.1 and later
	-- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
	-- hivemall v0.5-rc.1 or later
	p.model_weight,
	tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
	-- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
	-- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5-rc.1 or later
	FROM (
	SELECT
	-- hivemall v0.4.1 and later
	-- model_id, model_type, pred_model
	-- hivemall v0.5-rc.1 or later
	model_id, model_weight, model
	FROM model
	DISTRIBUTE BY rand(1)
	) p
	LEFT OUTER JOIN training t
	) t1
	group by
	rowid;
	```

	# Evaluation

	```sql
	select count(1) from training;
	```
	> 150

	```sql
	set hivevar:total_cnt=150;

	WITH t1 as (
	SELECT
	t.rowid,
	t.label as actual,
	p.predicted.label as predicted
	FROM
	predicted p
	LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
	)
	SELECT
	count(1) / ${total_cnt}
	FROM
	t1
	WHERE
	actual = predicted
	;
	```
	> 0.98

	# Graphvis export

	> #### Note
	> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
	> Better to limit tree depth on training by `-depth` option to plot a Decision Tree.

	Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query:

	```
	> select tree_export("","-help");

	usage: tree_export(string model, const string options, optional
	array<string> featureNames=null, optional array<string>
	classNames=null) - exports a Decision Tree model as javascript/dot]
	[-help] [-output_name <arg>] [-r] [-t <arg>]
	-help Show function help
	-output_name,--outputName <arg> output name [default: predicted]
	-r,--regression Is regression tree or not
	-t,--type <arg> Type of output [default: js,
	javascript/js, graphvis/dot
	```

	```sql
	CREATE TABLE model_exported
	STORED AS ORC tblproperties("orc.compress"="SNAPPY")
	AS
	select
	model_id,
	tree_export(model, "-type javascript", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as js,
	tree_export(model, "-type graphvis", array('sepal_length','sepal_width','petal_length','petak_width'), array('Setosa','Versicolour','Virginica')) as dot
	from
	model
	-- limit 1
	;
	```

	```
	digraph Tree {
	node [shape=box, style="filled, rounded", color="black", fontname=helvetica];
	edge [fontname=helvetica];
	0 [label=<petal_length ≤ 2.599999964237213>, fillcolor="#00000000"];
	1 [label=<predicted = Setosa>, fillcolor="0.0000,1.000,1.000", shape=ellipse];
	0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"];
	2 [label=<petal_length ≤ 4.950000047683716>, fillcolor="#00000000"];
	0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"];
	3 [label=<petak_width ≤ 1.6500000357627869>, fillcolor="#00000000"];
	2 -> 3;
	4 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
	3 -> 4;
	5 [label=<sepal_width ≤ 3.100000023841858>, fillcolor="#00000000"];
	3 -> 5;
	6 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
	5 -> 6;
	7 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
	5 -> 7;
	8 [label=<petak_width ≤ 1.75>, fillcolor="#00000000"];
	2 -> 8;
	9 [label=<petal_length ≤ 5.299999952316284>, fillcolor="#00000000"];
	8 -> 9;
	10 [label=<predicted = Versicolour>, fillcolor="0.3333,1.000,1.000", shape=ellipse];
	9 -> 10;
	11 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
	9 -> 11;
	12 [label=<predicted = Virginica>, fillcolor="0.6667,1.000,1.000", shape=ellipse];
	8 -> 12;
	}
	```

	<img src="../resources/images/iris.png" alt="Iris Graphvis output"/>

	You can draw a graph by `dot -Tpng iris.dot -o iris.png` or using [Viz.js](http://viz-js.com/).