blob: f493d617ec3d9dead846a8b57b1beca495d51dfb [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.4"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Decision Tree</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
$(window).load(resizeHeight);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script src="../mathjax/MathJax.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">1.2</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__dectree.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
<!--BEGIN VERSIONS LINKS-->
<td style="padding-left: 0.5em;">
<div class="versionlist"><ul>
<li class="head">More versions:</li>
<li><a href="../v1.1/index.html">v1.1</a></li>
<li><a href="../v1.0/index.html">v1.0</a></li>
<li><a href="../v0.7/index.html">v0.7</a></li>
<li><a href="../v0.5/index.html">v0.5</a></li></ul>
</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.4 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__dectree.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark">&#160;</span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark">&#160;</span>Groups</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Decision Tree<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a></div></div> </div>
</div><!--header-->
<div class="contents">
<dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method is still in early-stage development. There may be some issues that will be addressed in a future version. The interface and implementation are subject to change. </em></dd></dl>
<dl class="section user"><dt>About</dt><dd></dd></dl>
<p>This module provides an implementation of the C4.5 algorithm to grow decision trees.</p>
<p>The implementation supports:</p>
<ul>
<li>Building decision tree</li>
<li>Multiple split criteria, including: Information Gain, Gini Coefficient, and Gain Ratio</li>
<li>Decision tree Pruning</li>
<li>Decision tree classification/scoring</li>
<li>Decision tree display</li>
<li>Rule generation</li>
<li>Continuous and discrete features</li>
<li>Missing value handling</li>
</ul>
<dl class="section user"><dt>Input</dt><dd></dd></dl>
<p>The <b>training data</b> is expected to be of the following form: </p>
<pre>{TABLE|VIEW} <em>trainingSource</em> (
...
<em>id</em> INT|BIGINT,
<em>feature1</em> SUPPORTED_DATA_TYPE,
<em>feature2</em> SUPPORTED_DATA_TYPE,
<em>feature3</em> SUPPORTED_DATA_TYPE,
....................
<em>featureN</em> SUPPORTED_DATA_TYPE,
<em>class</em> SUPPORTED_DATA_TYPE,
...
)</pre><p>The detailed list of SUPPORTED_DATA_TYPE is: SMALLINT, INT, BIGINT, FLOAT8, REAL, DECIMAL, INET, CIDR, MACADDR, BOOLEAN, CHAR, VARCHAR, TEXT, "char", DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.</p>
<p>The <b>data to classify</b> is expected to be of the same form as <b>training data</b>, except that it does not need a class column.</p>
<dl class="section user"><dt>Usage</dt><dd><ul>
<li>Run the training algorithm on the source data: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#a18b30ff1a063e7cd16274bf7ab2a71dc">c45_train</a>(
'<em>split_criterion</em>',
'<em>training_table_name</em>',
'<em>result_tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>continuous_feature_names</em>',
'<em>feature_col_names</em>',
'<em>id_col_name</em>',
'<em>class_col_name</em>',
'<em>confidence_level</em>',
'<em>how2handle_missing_value</em>',
'<em>max_tree_depth</em>',
'<em>node_prune_threshold</em>',
'<em>node_split_threshold</em>',
'<em>verbosity</em>');
</pre> This will create the decision tree output table storing an abstract object (representing the model) used for further classification. Column names: <pre>
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids
----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+--------
...</pre></li>
<li>Run the classification function using the learned model: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#af5eb174eeecd11233409657221586cf1">c45_classify</a>(
'<em>tree_table_name</em>',
'<em>classification_table_name</em>',
'<em>result_table_name</em>');</pre> This will create the result_table with the classification results. <pre> </pre></li>
<li>Run the scoring function to score the learned model against a validation data set: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#af0739749507c1097003dcf529d29fee2">c45_score</a>(
'<em>tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>verbosity</em>');</pre> This will give a ratio of correctly classified items in the validation set. <pre> </pre></li>
<li>Run the display tree function using the learned model: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#ad7f190eb8e5d53f4772fac699787c0fe">c45_display</a>(
'<em>tree_table_name</em>');</pre> This will display the trained tree in human readable format. <pre> </pre></li>
<li>Run the clean tree function as below: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#ac25e17ecbc70149aa559018e718fc793">c45_clean</a>(
'<em>tree_table_name</em>');</pre> This will clean up the learned model and all metadata. <pre> </pre></li>
</ul>
</dd></dl>
<dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Prepare an input table/view, e.g.: <pre class="fragment">sql&gt; select * from golf_data order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)</pre></li>
<li>Train the decision tree model, e.g.: <pre class="fragment">sql&gt; SELECT * FROM MADlib.c45_clean('trained_tree_infogain');
sql&gt; SELECT * FROM MADlib.c45_train(
'infogain', -- split criterion_name
'golf_data', -- input table name
'trained_tree_infogain', -- result tree name
null, -- validation table name
'temperature,humidity', -- continuous feature names
'outlook,temperature,humidity,windy', -- feature column names
'id', -- id column name
'class', -- class column name
100, -- confidence level
'explicit', -- missing value preparation
5, -- max tree depth
0.001, -- min percent mode
0.001, -- min percent split
0); -- verbosity
training_set_size | tree_nodes | tree_depth | training_time | split_criterion
-------------------+------------+------------+-----------------+-----------------
14 | 8 | 3 | 00:00:00.871805 | infogain
(1 row)
</pre></li>
<li>Check a few rows from the tree model table: <pre class="fragment">sql&gt; select * from trained_tree_infogain order by id;
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value
----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f |
2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f |
3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f |
4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70
5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f |
6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f |
7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f |
8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f |
(8 rows)</pre></li>
<li>Display the tree in human-readable format: <pre class="fragment">sql&gt; select MADlib.c45_display('trained_tree_infogain');
c45_display
---------------------------------------------------------------------------------------
Tree 1
Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(5) predict_prob(0.6)
windy: = false : class( Play) num_elements(3) predict_prob(1)
windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6)
humidity: &lt;= 70 : class( Play) num_elements(2) predict_prob(1)
humidity: &gt; 70 : class( Do not Play) num_elements(3) predict_prob(1)
(1 row)</pre></li>
<li>To classify data with the learned model: <pre class="fragment">sql&gt; select * from MADlib.c45_classify(
'trained_tree_infogain', -- name of the trained model
'golf_data', -- name of the table containing data to classify
'classification_result'); -- name of the output table
input_set_size | classification_time
----------------+-----------------
14 | 00:00:00.247713
(1 row)
</pre></li>
<li>Check classification results: <pre class="fragment">sql&gt; select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
MADlib.classification_result c,golf_data t where t.id=c.id order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
</pre></li>
<li>Score the data against a validation set: <pre class="fragment">sql&gt; select * from MADlib.c45_score(
'trained_tree_infogain',
'golf_data_validation',
0);
c45_score
-----------
1
(1 row)
</pre></li>
<li>Clean up the tree and metadata: <pre class="fragment">sql&gt; select MADlib.c45_clean('trained_tree_infogain');
c45_clean
-----------
(1 row)
</pre></li>
</ol>
</dd></dl>
<dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] <a href="http://en.wikipedia.org/wiki/C4.5_algorithm">http://en.wikipedia.org/wiki/C4.5_algorithm</a></p>
<dl class="section see"><dt>See Also</dt><dd>File <a class="el" href="c45_8sql__in.html" title="C4.5 APIs and main controller written in PL/PGSQL. ">c45.sql_in</a> documenting the SQL functions. </dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Jan 9 2014 20:35:40 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li>
</ul>
</div>
</body>
</html>