| <!-- HTML header for doxygen 1.8.4--> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <meta http-equiv="X-UA-Compatible" content="IE=9"/> |
| <meta name="generator" content="Doxygen 1.8.4"/> |
| <meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> |
| <title>MADlib: Decision Tree</title> |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| $(window).load(resizeHeight); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { searchBox.OnSelectItem(0); }); |
| </script> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script><script src="../mathjax/MathJax.js"></script> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="madlib_extra.css" rel="stylesheet" type="text/css"/> |
| <!-- google analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| ga('create', 'UA-45382226-1', 'auto'); |
| ga('send', 'pageview'); |
| </script> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div, it is closed by doxygen! --> |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname">MADlib |
|  <span id="projectnumber">1.2</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__dectree.html"> A newer version is available</a></span> |
| </div> |
| <div id="projectbrief">User Documentation</div> |
| </td> |
| <!--BEGIN VERSIONS LINKS--> |
| <td style="padding-left: 0.5em;"> |
| <div class="versionlist"><ul> |
| <li class="head">More versions:</li> |
| <li><a href="../v1.1/index.html">v1.1</a></li> |
| <li><a href="../v1.0/index.html">v1.0</a></li> |
| <li><a href="../v0.7/index.html">v0.7</a></li> |
| <li><a href="../v0.5/index.html">v0.5</a></li></ul> |
| </div> |
| </td> |
| <td> <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <!-- end header part --> |
| <!-- Generated by Doxygen 1.8.4 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| </div><!-- top --> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| <div id="nav-sync" class="sync"></div> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $(document).ready(function(){initNavTree('group__grp__dectree.html','');}); |
| </script> |
| <div id="doc-content"> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark"> </span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark"> </span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark"> </span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark"> </span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark"> </span>Groups</a></div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">Decision Tree<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a></div></div> </div> |
| </div><!--header--> |
| <div class="contents"> |
| <dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method is still in early-stage development. There may be some issues that will be addressed in a future version. Interface and implementation are subject to change. </em></dd></dl> |
| <dl class="section user"><dt>About</dt><dd></dd></dl> |
| <p>This module provides an implementation of the C4.5 algorithm to grow decision trees.</p> |
| <p>The implementation supports:</p> |
| <ul> |
| <li>Building decision tree</li> |
| <li>Multiple split criteria, including: Information Gain, Gini Coefficient, and Gain Ratio</li> |
| <li>Decision tree Pruning</li> |
| <li>Decision tree classification/scoring</li> |
| <li>Decision tree display</li> |
| <li>Rule generation</li> |
| <li>Continuous and discrete features</li> |
| <li>Missing value handling</li> |
| </ul> |
| <dl class="section user"><dt>Input</dt><dd></dd></dl> |
| <p>The <b>training data</b> is expected to be of the following form: </p> |
| <pre>{TABLE|VIEW} <em>trainingSource</em> ( |
| ... |
| <em>id</em> INT|BIGINT, |
| <em>feature1</em> SUPPORTED_DATA_TYPE, |
| <em>feature2</em> SUPPORTED_DATA_TYPE, |
| <em>feature3</em> SUPPORTED_DATA_TYPE, |
| .................... |
| <em>featureN</em> SUPPORTED_DATA_TYPE, |
| <em>class</em> SUPPORTED_DATA_TYPE, |
| ... |
| )</pre><p>The detailed list of SUPPORTED_DATA_TYPE is: SMALLINT, INT, BIGINT, FLOAT8, REAL, DECIMAL, INET, CIDR, MACADDR, BOOLEAN, CHAR, VARCHAR, TEXT, "char", DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.</p> |
| <p>The <b>data to classify</b> is expected to be of the same form as <b>training data</b>, except that it does not need a class column.</p> |
| <dl class="section user"><dt>Usage</dt><dd><ul> |
| <li>Run the training algorithm on the source data: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#a18b30ff1a063e7cd16274bf7ab2a71dc">c45_train</a>( |
| '<em>split_criterion</em>', |
| '<em>training_table_name</em>', |
| '<em>result_tree_table_name</em>', |
| '<em>validation_table_name</em>', |
| '<em>continuous_feature_names</em>', |
| '<em>feature_col_names</em>', |
| '<em>id_col_name</em>', |
| '<em>class_col_name</em>', |
| '<em>confidence_level</em>', |
| '<em>how2handle_missing_value</em>', |
| '<em>max_tree_depth</em>', |
| '<em>node_prune_threshold</em>', |
| '<em>node_split_threshold</em>', |
| '<em>verbosity</em>'); |
| </pre> This will create the decision tree output table storing an abstract object (representing the model) used for further classification. Column names: <pre> |
| id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids |
| ----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+-------- |
| ...</pre></li> |
| <li>Run the classification function using the learned model: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#af5eb174eeecd11233409657221586cf1">c45_classify</a>( |
| '<em>tree_table_name</em>', |
| '<em>classification_table_name</em>', |
| '<em>result_table_name</em>');</pre> This will create the result_table with the classification results. <pre> </pre></li> |
| <li>Run the scoring function to score the learned model against a validation data set: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#af0739749507c1097003dcf529d29fee2">c45_score</a>( |
| '<em>tree_table_name</em>', |
| '<em>validation_table_name</em>', |
| '<em>verbosity</em>');</pre> This will give a ratio of correctly classified items in the validation set. <pre> </pre></li> |
| <li>Run the display tree function using the learned model: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#ad7f190eb8e5d53f4772fac699787c0fe">c45_display</a>( |
| '<em>tree_table_name</em>');</pre> This will display the trained tree in human readable format. <pre> </pre></li> |
| <li>Run the clean tree function as below: <pre>SELECT * FROM <a class="el" href="c45_8sql__in.html#ac25e17ecbc70149aa559018e718fc793">c45_clean</a>( |
| '<em>tree_table_name</em>');</pre> This will clean up the learned model and all metadata. <pre> </pre></li> |
| </ul> |
| </dd></dl> |
| <dl class="section user"><dt>Examples</dt><dd><ol type="1"> |
| <li>Prepare an input table/view, e.g.: <pre class="fragment">sql> select * from golf_data order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows)</pre></li> |
| <li>Train the decision tree model, e.g.: <pre class="fragment">sql> SELECT * FROM MADlib.c45_clean('trained_tree_infogain'); |
| sql> SELECT * FROM MADlib.c45_train( |
| 'infogain', -- split criterion_name |
| 'golf_data', -- input table name |
| 'trained_tree_infogain', -- result tree name |
| null, -- validation table name |
| 'temperature,humidity', -- continuous feature names |
| 'outlook,temperature,humidity,windy', -- feature column names |
| 'id', -- id column name |
| 'class', -- class column name |
| 100, -- confidence level |
| 'explicit', -- missing value preparation |
| 5, -- max tree depth |
| 0.001, -- node prune threshold |
| 0.001, -- node split threshold |
| 0); -- verbosity |
| training_set_size | tree_nodes | tree_depth | training_time | split_criterion |
| -------------------+------------+------------+-----------------+----------------- |
| 14 | 8 | 3 | 00:00:00.871805 | infogain |
| (1 row) |
| </pre></li> |
| <li>Check a few rows from the tree model table: <pre class="fragment">sql> select * from trained_tree_infogain order by id; |
| id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value |
| ----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+------------- |
| 1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f | |
| 2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f | |
| 3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f | |
| 4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70 |
| 5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f | |
| 6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f | |
| 7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f | |
| 8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f | |
| (8 rows)</pre></li> |
| <li>To display the tree in human-readable format: <pre class="fragment">sql> select MADlib.c45_display('trained_tree_infogain'); |
| c45_display |
| --------------------------------------------------------------------------------------- |
| Tree 1 |
| Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143) |
| outlook: = overcast : class( Play) num_elements(4) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(5) predict_prob(0.6) |
| windy: = false : class( Play) num_elements(3) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(2) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6) |
| humidity: &lt;= 70 : class( Play) num_elements(2) predict_prob(1) |
| humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1) |
| (1 row)</pre></li> |
| <li>To classify data with the learned model: <pre class="fragment">sql> select * from MADlib.c45_classify( |
| 'trained_tree_infogain', -- name of the trained model |
| 'golf_data', -- name of the table containing data to classify |
| 'classification_result'); -- name of the output table |
| input_set_size | classification_time |
| ----------------+----------------- |
| 14 | 00:00:00.247713 |
| (1 row) |
| </pre></li> |
| <li>Check classification results: <pre class="fragment">sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from |
| MADlib.classification_result c,golf_data t where t.id=c.id order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows) |
| </pre></li> |
| <li>Score the data against a validation set: <pre class="fragment">sql> select * from MADlib.c45_score( |
| 'trained_tree_infogain', |
| 'golf_data_validation', |
| 0); |
| c45_score |
| ----------- |
| 1 |
| (1 row) |
| </pre></li> |
| <li>Clean up the tree and metadata: <pre class="fragment">testdb=# select MADlib.c45_clean('trained_tree_infogain'); |
| c45_clean |
| ----------- |
| |
| (1 row) |
| </pre></li> |
| </ol> |
| </dd></dl> |
| <dl class="section user"><dt>Literature</dt><dd></dd></dl> |
| <p>[1] <a href="http://en.wikipedia.org/wiki/C4.5_algorithm">http://en.wikipedia.org/wiki/C4.5_algorithm</a></p> |
| <dl class="section see"><dt>See Also</dt><dd>File <a class="el" href="c45_8sql__in.html" title="C4.5 APIs and main controller written in PL/PGSQL. ">c45.sql_in</a> documenting the SQL functions. </dd></dl> |
| </div><!-- contents --> |
| </div><!-- doc-content --> |
| <!-- start footer part --> |
| <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> |
| <ul> |
| <li class="footer">Generated on Thu Jan 9 2014 20:35:40 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li> |
| </ul> |
| </div> |
| </body> |
| </html> |