<!-- blob: adcf8a30570ec42268820bbef6f80d0e6399f2f0 [file] [log] [blame] -->
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.4"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Decision Tree (old C4.5 implementation)</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
$(window).load(resizeHeight);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"]
});
</script><script src="../mathjax/MathJax.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__dectree.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.4 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__dectree.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark">&#160;</span>Groups</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Decision Tree (old C4.5 implementation)<div class="ingroups"><a class="el" href="group__grp__deprecated.html">Deprecated Modules</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b>
<ul>
<li>
<a href="#input">Input</a> </li>
<li>
<a href="#train">Training Function</a> </li>
<li>
<a href="#classify">Classification Function</a> </li>
<li>
<a href="#score">Scoring Function</a> </li>
<li>
<a href="#display">Display Tree Function</a> </li>
<li>
<a href="#notes">Implementation Notes</a> </li>
<li>
<a href="#examples">Examples</a> </li>
<li>
<a href="#related">Related Topics</a> </li>
</ul>
</div><dl class="section warning"><dt>Warning</dt><dd><em> This is an old implementation of decision trees. For a newer implementation, please see <a class="el" href="group__grp__decision__tree.html">Decision Tree</a></em></dd></dl>
<p>This module provides an implementation of the C4.5 algorithm to grow decision trees.</p>
<p>The implementation supports:</p>
<ul>
<li>Building the decision tree</li>
<li>Multiple split criteria, including: &ndash; Information Gain &ndash; Gini Coefficient &ndash; Gain Ratio</li>
<li>Decision tree Pruning</li>
<li>Decision tree classification/scoring</li>
<li>Decision tree display</li>
<li>Rule generation</li>
<li>Continuous and discrete features</li>
<li>Missing value handling</li>
</ul>
<p><a class="anchor" id="train"></a></p>
<dl class="section user"><dt>Training Function</dt><dd></dd></dl>
<p>Run the training algorithm on the source data: </p>
<pre class="syntax">
c45_train( split_criterion,
training_table_name,
result_tree_table_name,
validation_table_name,
continuous_feature_names,
feature_col_names,
id_col_name,
class_col_name,
confidence_level,
how2handle_missing_value,
max_tree_depth,
node_prune_threshold,
node_split_threshold,
verbosity
)
</pre><p> <b>Arguments</b> </p>
<dl class="arglist">
<dt>split_criterion </dt>
<dd>The name of the split criterion that should be used for tree construction. The valid values are ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL. Information gain(infogain) and gini index(gini) are biased toward multivalued attributes. Gain ratio(gainratio) adjusts for this bias. However, it tends to prefer unbalanced splits in which one partition is much smaller than the others. </dd>
<dt>training_table_name </dt>
<dd>The name of the table/view with the source data. The <b>training data</b> is expected to be of the following form: <pre>{TABLE|VIEW} <em>trainingSource</em> (
...
<em>id</em> INT|BIGINT,
<em>feature1</em> SUPPORTED_DATA_TYPE,
<em>feature2</em> SUPPORTED_DATA_TYPE,
<em>feature3</em> SUPPORTED_DATA_TYPE,
....................
<em>featureN</em> SUPPORTED_DATA_TYPE,
<em>class</em> SUPPORTED_DATA_TYPE,
...
)</pre> The detailed list of SUPPORTED_DATA_TYPE is: SMALLINT, INT, BIGINT, FLOAT8, REAL, DECIMAL, INET, CIDR, MACADDR, BOOLEAN, CHAR, VARCHAR, TEXT, "char", DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL. </dd>
<dt>result_tree_table_name </dt>
<dd>The name of the table to contain the decision tree output. The table stores an abstract object (representing the model) used for further classification. It has the following columns: <table class="output">
<tr>
<th>id </th></tr>
<tr>
<th>tree_location </th></tr>
<tr>
<th>feature </th></tr>
<tr>
<th>probability </th></tr>
<tr>
<th>ebp_coeff </th></tr>
<tr>
<th>maxclass </th></tr>
<tr>
<th>scv </th></tr>
<tr>
<th>live </th></tr>
<tr>
<th>sample_size </th></tr>
<tr>
<th>parent_id </th></tr>
<tr>
<th>lmc_nid </th></tr>
<tr>
<th>lmc_fval </th></tr>
<tr>
<th>is_continuous </th></tr>
<tr>
<th>split_value </th></tr>
<tr>
<th>tid </th></tr>
<tr>
<th>dp_ids </th></tr>
</table>
</dd>
<dt>validation_table_name </dt>
<dd>The name of the table/view that contains the validation set used for tree pruning. The default is NULL, in which case we will not do tree pruning. </dd>
<dt>continuous_feature_names </dt>
<dd>A comma-separated list of the names of features whose values are continuous. The default is null, which means there are no continuous features in the training table. </dd>
<dt>feature_col_names </dt>
<dd>A comma-separated list of the names of table columns, each of which defines a feature. The default value is null, which means all the columns in the training table, except columns named ‘id’ and ‘class’, will be used as features. </dd>
<dt>id_col_name </dt>
<dd>The name of the column containing an ID for each record. </dd>
<dt>class_col_name </dt>
<dd>The name of the column containing the labeled class. </dd>
<dt>confidence_level </dt>
<dd>A statistical confidence interval of the resubstitution error. </dd>
<dt>how2handle_missing_value </dt>
<dd>The way to handle missing values. Valid values are 'explicit' and 'ignore'. </dd>
<dt>max_tree_depth </dt>
<dd>Specifies the maximum number of levels in the result DT to avoid overgrown DTs. </dd>
<dt>node_prune_threshold </dt>
<dd>The minimum percentage of the number of records required in a child node. It can't be NULL. The range of it is in [0.0, 1.0]. This threshold only applies to the non-root nodes. Therefore, if its value is 1, then the trained tree only has one node (the root node); if its value is 0, then no nodes will be pruned by this parameter. </dd>
<dt>node_split_threshold </dt>
<dd>The minimum percentage of the number of records required in a node in order for a further split to be possible. It can't be NULL. The range of it is in [0.0, 1.0]. If its value is 1, then the trained tree only has two levels, since only the root node can grow; if its value is 0, then trees can grow extensively. </dd>
<dt>verbosity </dt>
<dd>An integer greater than 0 means this function runs in verbose mode. </dd>
</dl>
<p><a class="anchor" id="classify"></a></p>
<dl class="section user"><dt>Classification Function</dt><dd></dd></dl>
<p>The classification function uses the learned model stored by the training function to create the classification results. </p>
<pre class="syntax">
c45_classify( tree_table_name,
classification_table_name,
result_table_name
)
</pre><p> <b>Arguments</b> </p>
<dl class="arglist">
<dt>tree_table_name </dt>
<dd><p class="startdd">The name of the table containing the trained model.</p>
<p class="enddd">The data to classify is expected to be in the same form as the training data, except that it does not need a class column. </p>
</dd>
<dt>classification_table_name </dt>
<dd>The name of the table containing the data to classify. </dd>
<dt>result_table_name </dt>
<dd>The name of the output table. </dd>
</dl>
<p><a class="anchor" id="score"></a></p>
<dl class="section user"><dt>Scoring Function</dt><dd>The scoring function scores the learned model against a validation data set. <pre class="syntax">
c45_score( tree_table_name,
validation_table_name,
verbosity
);
</pre></dd></dl>
<p>This gives a ratio of correctly classified items in the validation set.</p>
<p><a class="anchor" id="display"></a></p>
<dl class="section user"><dt>Display Tree Function</dt><dd></dd></dl>
<p>The display tree function displays the learned model in a human-readable format.</p>
<pre class="syntax">
c45_display( tree_table_name
);
</pre><p><a class="anchor" id="clean"></a></p>
<dl class="section user"><dt>Clean Tree Function</dt><dd></dd></dl>
<p>The clean tree function cleans up the learned model and all metadata. </p>
<pre class="syntax">
c45_clean( tree_table_name
);
</pre><p><a class="anchor" id="notes"></a></p>
<dl class="section user"><dt>Implementation Notes</dt><dd></dd></dl>
<p>Due to implementation differences, decision tree on HAWQ is much slower than on Greenplum database when running on small data sets. However, for larger data sets, the performance difference is much smaller. For example, in a test with 0.75 million rows of data, decision tree on HAWQ is only about twice as slow as on GPDB. This is because the overhead due to the different implementation is proportional to the tree size, and is usually negligible as data size increases (The tree size is not likely to increase proportionally with the data size. For example, if a 10-node tree is used to fit a data set with 1000 rows, it is very unlikely to fit another data set with 1 million rows with a 10000-node tree).</p>
<p><a class="anchor" id="examples"></a></p>
<dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Prepare an input table. <pre class="example">
SELECT * FROM golf_data ORDER BY id;
</pre> Result: <pre class="result">
id | outlook | temperature | humidity | windy | class
&#160;---+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
</pre></li>
<li>Train the decision tree model. Run the <a class="el" href="c45_8sql__in.html#ac25e17ecbc70149aa559018e718fc793" title="Cleanup the trained tree table and any relevant tables. ">c45_clean()</a> function first to clean up any model and metadata from previous executions. <pre class="example">
SELECT * FROM madlib.c45_clean( 'trained_tree_infogain'
);
SELECT * FROM madlib.c45_train( 'infogain',
'golf_data',
'trained_tree_infogain',
null,
'temperature,humidity',
'outlook,temperature,humidity,windy',
'id',
'class',
100,
'explicit',
5,
0.001,
0.001,
0
);
</pre> Result: <pre class="result">
training_set_size | tree_nodes | tree_depth | training_time | split_criterion
&#160;------------------+------------+------------+-----------------+-----------------
14 | 8 | 3 | 00:00:00.871805 | infogain
(1 row)
</pre></li>
<li>View the tree model table. <pre class="example">
SELECT * FROM trained_tree_infogain ORDER BY id;
</pre> Result: <pre class="result">
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value
&#160;---+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f |
2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f |
3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f |
4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70
5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f |
6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f |
7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f |
8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f |
(8 rows)
</pre></li>
<li>Display the tree with a human readable format: <pre class="example">
SELECT madlib.c45_display('trained_tree_infogain');
</pre> Result: <pre class="result">
c45_display
&#160;--------------------------------------------------------------------------------------
Tree 1
Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(5) predict_prob(0.6)
windy: = false : class( Play) num_elements(3) predict_prob(1)
windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6)
humidity: &lt;= 70 : class( Play) num_elements(2) predict_prob(1)
humidity: &gt; 70 : class( Do not Play) num_elements(3) predict_prob(1)
(1 row)
</pre></li>
<li>Classify some data with the learned model. <pre class="example">
SELECT * FROM madlib.c45_classify ( 'trained_tree_infogain',
'golf_data',
'classification_result'
);
</pre> Result: <pre class="result">
input_set_size | classification_time
----------------+-----------------
14 | 00:00:00.247713
(1 row)
</pre></li>
<li>Check the classification results. <pre class="example">
SELECT t.id, t.outlook, t.temperature, t.humidity, t.windy, c.class
FROM madlib.classification_result c, golf_data t
WHERE t.id=c.id ORDER BY id;
</pre> Result: <pre class="result">
id | outlook | temperature | humidity | windy | class
&#160;---+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
</pre></li>
<li>Score the data against a validation set. <pre class="example">
SELECT * FROM madlib.c45_score( 'trained_tree_infogain',
'golf_data_validation',
0
);
</pre> Result: <pre class="result">
c45_score
&#160;----------
1
(1 row)
</pre></li>
<li>Clean up the tree and metadata. <pre class="example">
SELECT madlib.c45_clean( 'trained_tree_infogain'
);
</pre> Result: <pre class="result">
c45_clean
&#160;----------
&#160;
(1 row)
</pre></li>
</ol>
</dd></dl>
<p><a class="anchor" id="literature"></a></p>
<dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] <a href="http://en.wikipedia.org/wiki/C4.5_algorithm">http://en.wikipedia.org/wiki/C4.5_algorithm</a></p>
<p><a class="anchor" id="related"></a></p>
<dl class="section user"><dt>Related Topics</dt><dd>File <a class="el" href="c45_8sql__in.html" title="C4.5 APIs and main controller written in PL/PGSQL. ">c45.sql_in</a> documenting the SQL functions. </dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Tue Dec 30 2014 21:44:13 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li>
</ul>
</div>
</body>
</html>