<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<title>MADlib: rf.sql_in Source File</title>

<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
  $(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
  $(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<!-- Loads MathJax from the sibling mathjax directory. The inline body calls
     MathJax.Hub.Config with the TeX input jax, the HTML-CSS output jax and the
     tex2jax / AMSmath / AMSsymbols extensions.
     NOTE(review): this relies on MathJax evaluating the body of its own loading
     script; confirm against the bundled MathJax version. -->
<script type="text/javascript" src="../mathjax/MathJax.js">
  MathJax.Hub.Config({
    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
    jax: ["input/TeX", "output/HTML-CSS"]
  });
</script>
</head>
<body>
<div id="top"><!-- do not remove this div! -->


<div id="titlearea">
<table cellspacing="0" cellpadding="0">
 <tbody>
 <tr style="height: 56px;">
  
  
  <td style="padding-left: 0.5em;">
   <div id="projectname">MADlib
   &#160;<span id="projectnumber">0.6</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./rf_8sql__in_source.html"> A newer version is available</a></span>
   </div>
   <div id="projectbrief">User Documentation</div>
  </td>
  
  
  
 </tr>
 </tbody>
</table>
</div>

<!-- Generated by Doxygen 1.7.5.1 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
<script type="text/javascript" src="dynsections.js"></script>
  <div id="navrow1" class="tabs">
    <ul class="tablist">
      <li><a href="index.html"><span>Main&#160;Page</span></a></li>
      <li><a href="modules.html"><span>Modules</span></a></li>
      <li class="current"><a href="files.html"><span>Files</span></a></li>
      <li>
        <div id="MSearchBox" class="MSearchBoxInactive">
        <span class="left">
          <img id="MSearchSelect" src="search/mag_sel.png"
               onmouseover="return searchBox.OnSearchSelectShow()"
               onmouseout="return searchBox.OnSearchSelectHide()"
               alt=""/>
          <input type="text" id="MSearchField" value="Search" accesskey="S"
               onfocus="searchBox.OnSearchFieldFocus(true)" 
               onblur="searchBox.OnSearchFieldFocus(false)" 
               onkeyup="searchBox.OnSearchFieldChange(event)"/>
          </span><span class="right">
            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
          </span>
        </div>
      </li>
    </ul>
  </div>
  <div id="navrow2" class="tabs2">
    <ul class="tablist">
      <li><a href="files.html"><span>File&#160;List</span></a></li>
      <li><a href="globals.html"><span>File&#160;Members</span></a></li>
    </ul>
  </div>
</div>
<div id="side-nav" class="ui-resizable side-nav-resizable">
  <div id="nav-tree">
    <div id="nav-tree-contents">
    </div>
  </div>
  <div id="splitbar" style="-moz-user-select:none;" 
       class="ui-resizable-handle">
  </div>
</div>
<script type="text/javascript">
  initNavTree('rf_8sql__in.html','');
</script>
<div id="doc-content">
<div class="header">
  <div class="headertitle">
<div class="title">rf.sql_in</div>  </div>
</div>
<div class="contents">
<a href="rf_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/** </span>
<a name="l00002"></a>00002 <span class="comment"> *</span>
<a name="l00003"></a>00003 <span class="comment"> * @file rf.sql_in</span>
<a name="l00004"></a>00004 <span class="comment"> *</span>
<a name="l00005"></a>00005 <span class="comment"> * @brief random forest APIs and main control logic written in PL/PGSQL</span>
<a name="l00006"></a>00006 <span class="comment"> * @date April 5, 2012</span>
<a name="l00007"></a>00007 <span class="comment"> *</span>
<a name="l00008"></a>00008 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
<a name="l00009"></a>00009 
<a name="l00010"></a>00010 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
<a name="l00011"></a>00011 <span class="stringliteral"></span>
<a name="l00012"></a>00012 <span class="stringliteral">/* Own macro definitions */</span>
<a name="l00013"></a>00013 <span class="stringliteral">m4_ifelse(</span>
<a name="l00014"></a>00014 <span class="stringliteral">    m4_eval(</span>
<a name="l00015"></a>00015 <span class="stringliteral">        m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
<a name="l00016"></a>00016         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ &lt; 401
<a name="l00017"></a>00017     ), 1,
<a name="l00018"></a>00018     `m4_define(`__GREENPLUM_PRE_4_1__<span class="charliteral">&#39;)&#39;</span>
<a name="l00019"></a>00019 )
<a name="l00020"></a>00020 m4_ifelse(
<a name="l00021"></a>00021     m4_eval(
<a name="l00022"></a>00022         m4_ifdef(`__POSTGRESQL__<span class="stringliteral">&#39;, 1, 0) &amp;&amp;</span>
<a name="l00023"></a>00023 <span class="stringliteral">        __DBMS_VERSION_MAJOR__ &lt; 9</span>
<a name="l00024"></a>00024 <span class="stringliteral">    ), 1,</span>
<a name="l00025"></a>00025 <span class="stringliteral">    `m4_define(`__POSTGRESQL_PRE_9_0__&#39;</span>)<span class="stringliteral">&#39;</span>
<a name="l00026"></a>00026 <span class="stringliteral">)</span>
<a name="l00027"></a>00027 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00028"></a>00028 <span class="comment">/**</span>
<a name="l00029"></a>00029 <span class="comment">@addtogroup grp_rf</span>
<a name="l00030"></a>00030 <span class="comment"></span>
<a name="l00031"></a>00031 <span class="comment">@about</span>
<a name="l00032"></a>00032 <span class="comment">A random forest (RF) is an ensemble classifier that consists of many decision </span>
<a name="l00033"></a>00033 <span class="comment">trees and outputs the class that is voted by the majority of the individual </span>
<a name="l00034"></a>00034 <span class="comment">trees.</span>
<a name="l00035"></a>00035 <span class="comment"></span>
<a name="l00036"></a>00036 <span class="comment">It has the following well-known advantages:</span>
<a name="l00037"></a>00037 <span class="comment">- Overall, RF produces better accuracy. </span>
<a name="l00038"></a>00038 <span class="comment">- It can be very efficient for large data sets. Trees of an RF can be </span>
<a name="l00039"></a>00039 <span class="comment">  trained in parallel.</span>
<a name="l00040"></a>00040 <span class="comment">- It can handle thousands of input attributes without attribute deletion.</span>
<a name="l00041"></a>00041 <span class="comment"></span>
<a name="l00042"></a>00042 <span class="comment">This module provides an implementation of the random forest algorithm </span>
<a name="l00043"></a>00043 <span class="comment">described in [1].</span>
<a name="l00044"></a>00044 <span class="comment"></span>
<a name="l00045"></a>00045 <span class="comment">The implementation supports:</span>
<a name="l00046"></a>00046 <span class="comment">- Building random forests</span>
<a name="l00047"></a>00047 <span class="comment">- Multiple split criteria, including:</span>
<a name="l00048"></a>00048 <span class="comment">  . Information Gain</span>
<a name="l00049"></a>00049 <span class="comment">  . Gini Coefficient</span>
<a name="l00050"></a>00050 <span class="comment">  . Gain Ratio</span>
<a name="l00051"></a>00051 <span class="comment">- Random forest Classification/Scoring</span>
<a name="l00052"></a>00052 <span class="comment">- Random forest Display</span>
<a name="l00053"></a>00053 <span class="comment">- Continuous and Discrete features</span>
<a name="l00054"></a>00054 <span class="comment">- Equal frequency discretization for continuous features</span>
<a name="l00055"></a>00055 <span class="comment">- Missing value handling</span>
<a name="l00056"></a>00056 <span class="comment">- Sampling with replacement</span>
<a name="l00057"></a>00057 <span class="comment"></span>
<a name="l00058"></a>00058 <span class="comment">@input</span>
<a name="l00059"></a>00059 <span class="comment"></span>
<a name="l00060"></a>00060 <span class="comment">The &lt;b&gt;training data&lt;/b&gt; is expected to be of </span>
<a name="l00061"></a>00061 <span class="comment">the following form:</span>
<a name="l00062"></a>00062 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;trainingSource&lt;/em&gt; (</span>
<a name="l00063"></a>00063 <span class="comment">    ...</span>
<a name="l00064"></a>00064 <span class="comment">    &lt;em&gt;id&lt;/em&gt; INT|BIGINT,</span>
<a name="l00065"></a>00065 <span class="comment">    &lt;em&gt;feature1&lt;/em&gt; SUPPORTED_DATA_TYPE,</span>
<a name="l00066"></a>00066 <span class="comment">    &lt;em&gt;feature2&lt;/em&gt; SUPPORTED_DATA_TYPE,</span>
<a name="l00067"></a>00067 <span class="comment">    &lt;em&gt;feature3&lt;/em&gt; SUPPORTED_DATA_TYPE,</span>
<a name="l00068"></a>00068 <span class="comment">    ....................</span>
<a name="l00069"></a>00069 <span class="comment">    &lt;em&gt;featureN&lt;/em&gt; SUPPORTED_DATA_TYPE,</span>
<a name="l00070"></a>00070 <span class="comment">    &lt;em&gt;class&lt;/em&gt;    SUPPORTED_DATA_TYPE,</span>
<a name="l00071"></a>00071 <span class="comment">    ...</span>
<a name="l00072"></a>00072 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00073"></a>00073 <span class="comment"></span>
<a name="l00074"></a>00074 <span class="comment">The detailed list of SUPPORTED_DATA_TYPE is: </span>
<a name="l00075"></a>00075 <span class="comment">SMALLINT, INT, BIGINT, FLOAT8, REAL, </span>
<a name="l00076"></a>00076 <span class="comment">DECIMAL, INET, CIDR, MACADDR, BOOLEAN,</span>
<a name="l00077"></a>00077 <span class="comment">CHAR, VARCHAR, TEXT, &quot;char&quot;, </span>
<a name="l00078"></a>00078 <span class="comment">DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.</span>
<a name="l00079"></a>00079 <span class="comment"></span>
<a name="l00080"></a>00080 <span class="comment">The &lt;b&gt;data to classify&lt;/b&gt; is expected to be </span>
<a name="l00081"></a>00081 <span class="comment">of the same form as &lt;b&gt;training data&lt;/b&gt;, except</span>
<a name="l00082"></a>00082 <span class="comment">that it does not need a class column.</span>
<a name="l00083"></a>00083 <span class="comment"></span>
<a name="l00084"></a>00084 <span class="comment">@usage</span>
<a name="l00085"></a>00085 <span class="comment"></span>
<a name="l00086"></a>00086 <span class="comment">- Run the training algorithm on the source data:</span>
<a name="l00087"></a>00087 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref rf_train(</span>
<a name="l00088"></a>00088 <span class="comment">    &#39;&lt;em&gt;split_criterion&lt;/em&gt;&#39;,</span>
<a name="l00089"></a>00089 <span class="comment">    &#39;&lt;em&gt;training_table_name&lt;/em&gt;&#39;, </span>
<a name="l00090"></a>00090 <span class="comment">    &#39;&lt;em&gt;result_rf_table_name&lt;/em&gt;&#39;, </span>
<a name="l00091"></a>00091 <span class="comment">    &#39;&lt;em&gt;num_trees&lt;/em&gt;&#39;,</span>
<a name="l00092"></a>00092 <span class="comment">    &#39;&lt;em&gt;features_per_node&lt;/em&gt;&#39;,</span>
<a name="l00093"></a>00093 <span class="comment">    &#39;&lt;em&gt;sampling_percentage&lt;/em&gt;&#39;,</span>
<a name="l00094"></a>00094 <span class="comment">    &#39;&lt;em&gt;continuous_feature_names&lt;/em&gt;&#39;, </span>
<a name="l00095"></a>00095 <span class="comment">    &#39;&lt;em&gt;feature_col_names&lt;/em&gt;&#39;,</span>
<a name="l00096"></a>00096 <span class="comment">    &#39;&lt;em&gt;id_col_name&lt;/em&gt;&#39;,</span>
<a name="l00097"></a>00097 <span class="comment">    &#39;&lt;em&gt;class_col_name&lt;/em&gt;&#39;,</span>
<a name="l00098"></a>00098 <span class="comment">    &#39;&lt;em&gt;how2handle_missing_value&lt;/em&gt;&#39;,</span>
<a name="l00099"></a>00099 <span class="comment">    &#39;&lt;em&gt;max_tree_depth&lt;/em&gt;&#39;,</span>
<a name="l00100"></a>00100 <span class="comment">    &#39;&lt;em&gt;node_prune_threshold&lt;/em&gt;&#39;,</span>
<a name="l00101"></a>00101 <span class="comment">    &#39;&lt;em&gt;node_split_threshold&lt;/em&gt;&#39;,</span>
<a name="l00102"></a>00102 <span class="comment">    &#39;&lt;em&gt;verbosity&lt;/em&gt;&#39;);</span>
<a name="l00103"></a>00103 <span class="comment">  &lt;/pre&gt;</span>
<a name="l00104"></a>00104 <span class="comment">  This will create the decision tree output table storing an abstract object</span>
<a name="l00105"></a>00105 <span class="comment">  (representing the model) used for further classification. Column names:</span>
<a name="l00106"></a>00106 <span class="comment">  &lt;pre&gt;    </span>
<a name="l00107"></a>00107 <span class="comment"> id | tree_location | feature |    probability    |    ebp_coeff     | maxclass |    split_gain     | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids </span>
<a name="l00108"></a>00108 <span class="comment">----+---------------+---------+-------------------+------------------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------</span>
<a name="l00109"></a>00109 <span class="comment">                                                     ...&lt;/pre&gt;    </span>
<a name="l00110"></a>00110 <span class="comment">    </span>
<a name="l00111"></a>00111 <span class="comment">- Run the classification function using the learned model: </span>
<a name="l00112"></a>00112 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref rf_classify(</span>
<a name="l00113"></a>00113 <span class="comment">    &#39;&lt;em&gt;rf_table_name&lt;/em&gt;&#39;, </span>
<a name="l00114"></a>00114 <span class="comment">    &#39;&lt;em&gt;classification_table_name&lt;/em&gt;&#39;, </span>
<a name="l00115"></a>00115 <span class="comment">    &#39;&lt;em&gt;result_table_name&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
<a name="l00116"></a>00116 <span class="comment">  This will create the result_table with the </span>
<a name="l00117"></a>00117 <span class="comment">  classification results. </span>
<a name="l00118"></a>00118 <span class="comment">  &lt;pre&gt; &lt;/pre&gt; </span>
<a name="l00119"></a>00119 <span class="comment"></span>
<a name="l00120"></a>00120 <span class="comment">- Run the scoring function to score the learned model against a validation data set:</span>
<a name="l00121"></a>00121 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref rf_score(</span>
<a name="l00122"></a>00122 <span class="comment">    &#39;&lt;em&gt;rf_table_name&lt;/em&gt;&#39;,</span>
<a name="l00123"></a>00123 <span class="comment">    &#39;&lt;em&gt;validation_table_name&lt;/em&gt;&#39;,</span>
<a name="l00124"></a>00124 <span class="comment">    &#39;&lt;em&gt;verbosity&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
<a name="l00125"></a>00125 <span class="comment">  This will give a ratio of correctly classified items in the validation set.</span>
<a name="l00126"></a>00126 <span class="comment">  &lt;pre&gt; &lt;/pre&gt;</span>
<a name="l00127"></a>00127 <span class="comment"></span>
<a name="l00128"></a>00128 <span class="comment">- Run the display tree function using the learned model: </span>
<a name="l00129"></a>00129 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref rf_display(</span>
<a name="l00130"></a>00130 <span class="comment">    &#39;&lt;em&gt;rf_table_name&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
<a name="l00131"></a>00131 <span class="comment">  This will display the trained trees in human readable format. </span>
<a name="l00132"></a>00132 <span class="comment">  &lt;pre&gt; &lt;/pre&gt; </span>
<a name="l00133"></a>00133 <span class="comment"></span>
<a name="l00134"></a>00134 <span class="comment">- Run the clean tree function as below: </span>
<a name="l00135"></a>00135 <span class="comment">  &lt;pre&gt;SELECT * FROM \ref rf_clean(</span>
<a name="l00136"></a>00136 <span class="comment">    &#39;&lt;em&gt;rf_table_name&lt;/em&gt;&#39;);&lt;/pre&gt;</span>
<a name="l00137"></a>00137 <span class="comment">  This will clean up the learned model and all metadata.</span>
<a name="l00138"></a>00138 <span class="comment">  &lt;pre&gt; &lt;/pre&gt; </span>
<a name="l00139"></a>00139 <span class="comment"></span>
<a name="l00140"></a>00140 <span class="comment">@examp</span>
<a name="l00141"></a>00141 <span class="comment"></span>
<a name="l00142"></a>00142 <span class="comment">-# Prepare an input table/view, e.g.:</span>
<a name="l00143"></a>00143 <span class="comment">\verbatim</span>
<a name="l00144"></a>00144 <span class="comment">sql&gt; select * from golf_data order by id;</span>
<a name="l00145"></a>00145 <span class="comment"> id | outlook  | temperature | humidity | windy  |    class     </span>
<a name="l00146"></a>00146 <span class="comment">----+----------+-------------+----------+--------+--------------</span>
<a name="l00147"></a>00147 <span class="comment">  1 | sunny    |          85 |       85 |  false |  Do not Play</span>
<a name="l00148"></a>00148 <span class="comment">  2 | sunny    |          80 |       90 |  true  |  Do not Play</span>
<a name="l00149"></a>00149 <span class="comment">  3 | overcast |          83 |       78 |  false |  Play</span>
<a name="l00150"></a>00150 <span class="comment">  4 | rain     |          70 |       96 |  false |  Play</span>
<a name="l00151"></a>00151 <span class="comment">  5 | rain     |          68 |       80 |  false |  Play</span>
<a name="l00152"></a>00152 <span class="comment">  6 | rain     |          65 |       70 |  true  |  Do not Play</span>
<a name="l00153"></a>00153 <span class="comment">  7 | overcast |          64 |       65 |  true  |  Play</span>
<a name="l00154"></a>00154 <span class="comment">  8 | sunny    |          72 |       95 |  false |  Do not Play</span>
<a name="l00155"></a>00155 <span class="comment">  9 | sunny    |          69 |       70 |  false |  Play</span>
<a name="l00156"></a>00156 <span class="comment"> 10 | rain     |          75 |       80 |  false |  Play</span>
<a name="l00157"></a>00157 <span class="comment"> 11 | sunny    |          75 |       70 |  true  |  Play</span>
<a name="l00158"></a>00158 <span class="comment"> 12 | overcast |          72 |       90 |  true  |  Play</span>
<a name="l00159"></a>00159 <span class="comment"> 13 | overcast |          81 |       75 |  false |  Play</span>
<a name="l00160"></a>00160 <span class="comment"> 14 | rain     |          71 |       80 |  true  |  Do not Play</span>
<a name="l00161"></a>00161 <span class="comment">(14 rows)</span>
<a name="l00162"></a>00162 <span class="comment">\endverbatim</span>
<a name="l00163"></a>00163 <span class="comment">-# Train the random forest, e.g.:</span>
<a name="l00164"></a>00164 <span class="comment">\verbatim</span>
<a name="l00165"></a>00165 <span class="comment">sql&gt; SELECT * FROM MADLIB_SCHEMA.rf_clean(&#39;trained_tree_infogain&#39;);</span>
<a name="l00166"></a>00166 <span class="comment">sql&gt; SELECT * FROM MADLIB_SCHEMA.rf_train(</span>
<a name="l00167"></a>00167 <span class="comment">       &#39;infogain&#39;,                           -- split criterion_name</span>
<a name="l00168"></a>00168 <span class="comment">       &#39;golf_data&#39;,                          -- input table name</span>
<a name="l00169"></a>00169 <span class="comment">       &#39;trained_tree_infogain&#39;,              -- result tree name</span>
<a name="l00170"></a>00170 <span class="comment">       10,                                   -- number of trees</span>
<a name="l00171"></a>00171 <span class="comment">       NULL,                                 -- features_per_node</span>
<a name="l00172"></a>00172 <span class="comment">       0.632,                                -- sampling_percentage</span>
<a name="l00173"></a>00173 <span class="comment">       &#39;temperature,humidity&#39;,               -- continuous feature names</span>
<a name="l00174"></a>00174 <span class="comment">       &#39;outlook,temperature,humidity,windy&#39;, -- feature column names</span>
<a name="l00175"></a>00175 <span class="comment">       &#39;id&#39;,                                 -- id column name</span>
<a name="l00176"></a>00176 <span class="comment">       &#39;class&#39;,                              -- class column name</span>
<a name="l00177"></a>00177 <span class="comment">       &#39;explicit&#39;,                           -- how to handle missing value</span>
<a name="l00178"></a>00178 <span class="comment">       10,                                   -- max tree depth</span>
<a name="l00179"></a>00179 <span class="comment">       0.0,                                  -- min percent mode</span>
<a name="l00180"></a>00180 <span class="comment">       0.0,                                  -- min percent split</span>
<a name="l00181"></a>00181 <span class="comment">       0,                                    -- max split point</span>
<a name="l00182"></a>00182 <span class="comment">       0);                                   -- verbosity</span>
<a name="l00183"></a>00183 <span class="comment"> training_time  | num_of_samples | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion |    acs_time     |    acc_time     |    olap_time    |   update_time   |    best_time    </span>
<a name="l00184"></a>00184 <span class="comment">----------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------</span>
<a name="l00185"></a>00185 <span class="comment"> 00:00:03.60498 |           14 |        10 |                 3 |             71 |              6 | infogain        | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658</span>
<a name="l00186"></a>00186 <span class="comment">(1 row)</span>
<a name="l00187"></a>00187 <span class="comment">\endverbatim</span>
<a name="l00188"></a>00188 <span class="comment">-# Check the table records that keep the random forest:</span>
<a name="l00189"></a>00189 <span class="comment">\verbatim</span>
<a name="l00190"></a>00190 <span class="comment">sql&gt; select * from trained_tree_infogain order by tid,id;</span>
<a name="l00191"></a>00191 <span class="comment"> id | tree_location | feature |    probability    | ebp_coeff | maxclass |     split_gain     | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids </span>
<a name="l00192"></a>00192 <span class="comment">----+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------</span>
<a name="l00193"></a>00193 <span class="comment">  1 | {0}           |       3 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      24 |        1 | f               |             |   1 | </span>
<a name="l00194"></a>00194 <span class="comment"> 24 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         1 |         |          | f               |             |   1 | {3}</span>
<a name="l00195"></a>00195 <span class="comment"> 25 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         1 |         |          | f               |             |   1 | {3}</span>
<a name="l00196"></a>00196 <span class="comment"> 26 | {0,3}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         1 |      42 |        1 | t               |          70 |   1 | {3}</span>
<a name="l00197"></a>00197 <span class="comment"> 42 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        26 |         |          | f               |             |   1 | </span>
<a name="l00198"></a>00198 <span class="comment"> 43 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        26 |         |          | f               |             |   1 | </span>
<a name="l00199"></a>00199 <span class="comment">  2 | {0}           |       2 | 0.555555555555556 |         1 |        1 |   0.17636684303351 |    0 |        9 |         0 |      11 |        1 | t               |          65 |   2 | </span>
<a name="l00200"></a>00200 <span class="comment"> 11 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         2 |         |          | f               |             |   2 | </span>
<a name="l00201"></a>00201 <span class="comment"> 12 | {0,2}         |       4 | 0.714285714285714 |         1 |        1 |  0.217687074829932 |    0 |        7 |         2 |      44 |        1 | f               |             |   2 | </span>
<a name="l00202"></a>00202 <span class="comment"> 44 | {0,2,1}       |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |        12 |      57 |        1 | f               |             |   2 | {4}</span>
<a name="l00203"></a>00203 <span class="comment"> 45 | {0,2,2}       |       3 |                 1 |         1 |        1 |                  0 |    0 |        4 |        12 |         |          | f               |             |   2 | {4}</span>
<a name="l00204"></a>00204 <span class="comment"> 57 | {0,2,1,1}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          78 |   2 | {4,3}</span>
<a name="l00205"></a>00205 <span class="comment"> 58 | {0,2,1,2}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          96 |   2 | {4,3}</span>
<a name="l00206"></a>00206 <span class="comment"> 59 | {0,2,1,3}     |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        44 |         |          | t               |          85 |   2 | {4,3}</span>
<a name="l00207"></a>00207 <span class="comment">  3 | {0}           |       2 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      27 |        1 | t               |          80 |   3 | </span>
<a name="l00208"></a>00208 <span class="comment"> 27 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        6 |         3 |         |          | f               |             |   3 | </span>
<a name="l00209"></a>00209 <span class="comment"> 28 | {0,2}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         3 |      46 |        1 | t               |          90 |   3 | </span>
<a name="l00210"></a>00210 <span class="comment"> 46 | {0,2,1}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        28 |         |          | f               |             |   3 | </span>
<a name="l00211"></a>00211 <span class="comment"> 47 | {0,2,2}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        28 |         |          | f               |             |   3 | </span>
<a name="l00212"></a>00212 <span class="comment">  4 | {0}           |       4 | 0.888888888888889 |         1 |        2 | 0.0493827160493827 |    0 |        9 |         0 |      13 |        1 | f               |             |   4 | </span>
<a name="l00213"></a>00213 <span class="comment"> 13 | {0,1}         |       3 |                 1 |         1 |        2 |                  0 |    0 |        6 |         4 |         |          | f               |             |   4 | {4}</span>
<a name="l00214"></a>00214 <span class="comment"> 14 | {0,2}         |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         4 |      48 |        1 | f               |             |   4 | {4}</span>
<a name="l00215"></a>00215 <span class="comment"> 48 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        14 |         |          | t               |          90 |   4 | {4,3}</span>
<a name="l00216"></a>00216 <span class="comment"> 49 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        14 |         |          | t               |          80 |   4 | {4,3}</span>
<a name="l00217"></a>00217 <span class="comment">  5 | {0}           |       2 | 0.888888888888889 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      29 |        1 | t               |          90 |   5 | </span>
<a name="l00218"></a>00218 <span class="comment"> 29 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        8 |         5 |         |          | f               |             |   5 | </span>
<a name="l00219"></a>00219 <span class="comment"> 30 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        1 |         5 |         |          | f               |             |   5 | </span>
<a name="l00220"></a>00220 <span class="comment">  6 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.345679012345679 |    0 |        9 |         0 |      15 |        1 | f               |             |   6 | </span>
<a name="l00221"></a>00221 <span class="comment"> 15 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}</span>
<a name="l00222"></a>00222 <span class="comment"> 16 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         6 |      51 |        1 | f               |             |   6 | {3}</span>
<a name="l00223"></a>00223 <span class="comment"> 17 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}</span>
<a name="l00224"></a>00224 <span class="comment"> 51 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        16 |         |          | t               |          96 |   6 | {3,4}</span>
<a name="l00225"></a>00225 <span class="comment"> 52 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        16 |         |          | t               |          70 |   6 | {3,4}</span>
<a name="l00226"></a>00226 <span class="comment">  7 | {0}           |       4 | 0.666666666666667 |         1 |        2 |  0.253968253968254 |    0 |        9 |         0 |      31 |        1 | f               |             |   7 | </span>
<a name="l00227"></a>00227 <span class="comment"> 31 | {0,1}         |       2 | 0.857142857142857 |         1 |        2 |  0.102040816326531 |    0 |        7 |         7 |      36 |        1 | t               |          80 |   7 | {4}</span>
<a name="l00228"></a>00228 <span class="comment"> 32 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        2 |         7 |         |          | f               |             |   7 | {4}</span>
<a name="l00229"></a>00229 <span class="comment"> 36 | {0,1,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        5 |        31 |         |          | f               |             |   7 | </span>
<a name="l00230"></a>00230 <span class="comment"> 37 | {0,1,2}       |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |        31 |      60 |        1 | t               |          95 |   7 | </span>
<a name="l00231"></a>00231 <span class="comment"> 60 | {0,1,2,1}     |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 | </span>
<a name="l00232"></a>00232 <span class="comment"> 61 | {0,1,2,2}     |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 | </span>
<a name="l00233"></a>00233 <span class="comment">  8 | {0}           |       3 | 0.777777777777778 |         1 |        2 | 0.0864197530864197 |    0 |        9 |         0 |      18 |        1 | f               |             |   8 | </span>
<a name="l00234"></a>00234 <span class="comment"> 18 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         8 |         |          | f               |             |   8 | {3}</span>
<a name="l00235"></a>00235 <span class="comment"> 19 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         8 |      38 |        1 | f               |             |   8 | {3}</span>
<a name="l00236"></a>00236 <span class="comment"> 20 | {0,3}         |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |         8 |      53 |        1 | t               |          70 |   8 | {3}</span>
<a name="l00237"></a>00237 <span class="comment"> 38 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        19 |         |          | t               |          80 |   8 | {3,4}</span>
<a name="l00238"></a>00238 <span class="comment"> 39 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        19 |         |          | t               |          80 |   8 | {3,4}</span>
<a name="l00239"></a>00239 <span class="comment"> 53 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 | </span>
<a name="l00240"></a>00240 <span class="comment"> 54 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 | </span>
<a name="l00241"></a>00241 <span class="comment">  9 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.327160493827161 |    0 |        9 |         0 |      33 |        1 | f               |             |   9 | </span>
<a name="l00242"></a>00242 <span class="comment"> 33 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         9 |         |          | f               |             |   9 | {3}</span>
<a name="l00243"></a>00243 <span class="comment"> 34 | {0,2}         |       4 |              0.75 |         1 |        2 |              0.375 |    0 |        4 |         9 |      55 |        1 | f               |             |   9 | {3}</span>
<a name="l00244"></a>00244 <span class="comment"> 35 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         9 |         |          | f               |             |   9 | {3}</span>
<a name="l00245"></a>00245 <span class="comment"> 55 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        3 |        34 |         |          | t               |          96 |   9 | {3,4}</span>
<a name="l00246"></a>00246 <span class="comment"> 56 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        34 |         |          | t               |          70 |   9 | {3,4}</span>
<a name="l00247"></a>00247 <span class="comment"> 10 | {0}           |       3 | 0.666666666666667 |         1 |        2 |  0.277777777777778 |    0 |        9 |         0 |      21 |        1 | f               |             |  10 | </span>
<a name="l00248"></a>00248 <span class="comment"> 21 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        10 |         |          | f               |             |  10 | {3}</span>
<a name="l00249"></a>00249 <span class="comment"> 22 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |        10 |         |          | f               |             |  10 | {3}</span>
<a name="l00250"></a>00250 <span class="comment"> 23 | {0,3}         |       2 |              0.75 |         1 |        1 |              0.375 |    0 |        4 |        10 |      40 |        1 | t               |          70 |  10 | {3}</span>
<a name="l00251"></a>00251 <span class="comment"> 40 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        23 |         |          | f               |             |  10 | </span>
<a name="l00252"></a>00252 <span class="comment"> 41 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |        23 |         |          | f               |             |  10 | </span>
<a name="l00253"></a>00253 <span class="comment">(60 rows)</span>
<a name="l00254"></a>00254 <span class="comment">\endverbatim</span>
<a name="l00255"></a>00255 <span class="comment">-# To display the random forest in a human-readable format:</span>
<a name="l00256"></a>00256 <span class="comment">\verbatim</span>
<a name="l00257"></a>00257 <span class="comment">sql&gt; select * from MADLIB_SCHEMA.rf_display(&#39;trained_tree_infogain&#39;);</span>
<a name="l00258"></a>00258 <span class="comment">                                             rf_display                                              </span>
<a name="l00259"></a>00259 <span class="comment">-----------------------------------------------------------------------------------------------------</span>
<a name="l00260"></a>00260 <span class="comment">                                                                                                     </span>
<a name="l00261"></a>00261 <span class="comment"> Tree 1                                                                                              </span>
<a name="l00262"></a>00262 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    </span>
<a name="l00263"></a>00263 <span class="comment">         outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)                     </span>
<a name="l00264"></a>00264 <span class="comment">         outlook:  = rain  : class( Play)   num_elements(2)  predict_prob(1)                         </span>
<a name="l00265"></a>00265 <span class="comment">         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667) </span>
<a name="l00266"></a>00266 <span class="comment">             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)                     </span>
<a name="l00267"></a>00267 <span class="comment">             humidity:  &gt; 70  : class( Do not Play)   num_elements(2)  predict_prob(1)               </span>
<a name="l00268"></a>00268 <span class="comment"> </span>
<a name="l00269"></a>00269 <span class="comment">                                                                                                     </span>
<a name="l00270"></a>00270 <span class="comment"> Tree 2                                                                                              </span>
<a name="l00271"></a>00271 <span class="comment">     Root Node  : class( Do not Play)   num_elements(9)  predict_prob(0.555555555555556)             </span>
<a name="l00272"></a>00272 <span class="comment">         humidity:  &lt;= 65  : class( Play)   num_elements(2)  predict_prob(1)                         </span>
<a name="l00273"></a>00273 <span class="comment">         humidity:  &gt; 65  : class( Do not Play)   num_elements(7)  predict_prob(0.714285714285714)   </span>
<a name="l00274"></a>00274 <span class="comment">             windy:  =  false  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)     </span>
<a name="l00275"></a>00275 <span class="comment">                 outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)             </span>
<a name="l00276"></a>00276 <span class="comment">                 outlook:  = rain  : class( Play)   num_elements(1)  predict_prob(1)                 </span>
<a name="l00277"></a>00277 <span class="comment">                 outlook:  = sunny  : class( Do not Play)   num_elements(1)  predict_prob(1)         </span>
<a name="l00278"></a>00278 <span class="comment">             windy:  =  true  : class( Do not Play)   num_elements(4)  predict_prob(1)               </span>
<a name="l00279"></a>00279 <span class="comment"> </span>
<a name="l00280"></a>00280 <span class="comment">                                                                                                     </span>
<a name="l00281"></a>00281 <span class="comment"> Tree 3                                                                                              </span>
<a name="l00282"></a>00282 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    </span>
<a name="l00283"></a>00283 <span class="comment">         humidity:  &lt;= 80  : class( Play)   num_elements(6)  predict_prob(1)                         </span>
<a name="l00284"></a>00284 <span class="comment">         humidity:  &gt; 80  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667)   </span>
<a name="l00285"></a>00285 <span class="comment">             humidity:  &lt;= 90  : class( Do not Play)   num_elements(2)  predict_prob(1)              </span>
<a name="l00286"></a>00286 <span class="comment">             humidity:  &gt; 90  : class( Play)   num_elements(1)  predict_prob(1)                      </span>
<a name="l00287"></a>00287 <span class="comment"> </span>
<a name="l00288"></a>00288 <span class="comment">                                                                                                     </span>
<a name="l00289"></a>00289 <span class="comment"> Tree 4                                                                                              </span>
<a name="l00290"></a>00290 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)                    </span>
<a name="l00291"></a>00291 <span class="comment">         windy:  =  false  : class( Play)   num_elements(6)  predict_prob(1)                         </span>
<a name="l00292"></a>00292 <span class="comment">         windy:  =  true  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)          </span>
<a name="l00293"></a>00293 <span class="comment">             outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)                 </span>
<a name="l00294"></a>00294 <span class="comment">             outlook:  = rain  : class( Do not Play)   num_elements(1)  predict_prob(1)              </span>
<a name="l00295"></a>00295 <span class="comment"> </span>
<a name="l00296"></a>00296 <span class="comment">                                                                                                     </span>
<a name="l00297"></a>00297 <span class="comment"> Tree 5                                                                                              </span>
<a name="l00298"></a>00298 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)                    </span>
<a name="l00299"></a>00299 <span class="comment">         humidity:  &lt;= 90  : class( Play)   num_elements(8)  predict_prob(1)                         </span>
<a name="l00300"></a>00300 <span class="comment">         humidity:  &gt; 90  : class( Do not Play)   num_elements(1)  predict_prob(1)                   </span>
<a name="l00301"></a>00301 <span class="comment"> </span>
<a name="l00302"></a>00302 <span class="comment">                                                                                                     </span>
<a name="l00303"></a>00303 <span class="comment"> Tree 6                                                                                              </span>
<a name="l00304"></a>00304 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)                    </span>
<a name="l00305"></a>00305 <span class="comment">         outlook:  = overcast  : class( Play)   num_elements(3)  predict_prob(1)                     </span>
<a name="l00306"></a>00306 <span class="comment">         outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)         </span>
<a name="l00307"></a>00307 <span class="comment">             windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)                     </span>
<a name="l00308"></a>00308 <span class="comment">             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               </span>
<a name="l00309"></a>00309 <span class="comment">         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)                 </span>
<a name="l00310"></a>00310 <span class="comment"> </span>
<a name="l00311"></a>00311 <span class="comment">                                                                                                     </span>
<a name="l00312"></a>00312 <span class="comment"> Tree 7                                                                                              </span>
<a name="l00313"></a>00313 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)                    </span>
<a name="l00314"></a>00314 <span class="comment">         windy:  =  false  : class( Play)   num_elements(7)  predict_prob(0.857142857142857)         </span>
<a name="l00315"></a>00315 <span class="comment">             humidity:  &lt;= 80  : class( Play)   num_elements(5)  predict_prob(1)                     </span>
<a name="l00316"></a>00316 <span class="comment">             humidity:  &gt; 80  : class( Play)   num_elements(2)  predict_prob(0.5)                    </span>
<a name="l00317"></a>00317 <span class="comment">                 humidity:  &lt;= 95  : class( Do not Play)   num_elements(1)  predict_prob(1)          </span>
<a name="l00318"></a>00318 <span class="comment">                 humidity:  &gt; 95  : class( Play)   num_elements(1)  predict_prob(1)                  </span>
<a name="l00319"></a>00319 <span class="comment">         windy:  =  true  : class( Do not Play)   num_elements(2)  predict_prob(1)                   </span>
<a name="l00320"></a>00320 <span class="comment"> </span>
<a name="l00321"></a>00321 <span class="comment">                                                                                                     </span>
<a name="l00322"></a>00322 <span class="comment"> Tree 8                                                                                              </span>
<a name="l00323"></a>00323 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    </span>
<a name="l00324"></a>00324 <span class="comment">         outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)                     </span>
<a name="l00325"></a>00325 <span class="comment">         outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)         </span>
<a name="l00326"></a>00326 <span class="comment">             windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)                     </span>
<a name="l00327"></a>00327 <span class="comment">             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               </span>
<a name="l00328"></a>00328 <span class="comment">         outlook:  = sunny  : class( Play)   num_elements(2)  predict_prob(0.5)                      </span>
<a name="l00329"></a>00329 <span class="comment">             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)                     </span>
<a name="l00330"></a>00330 <span class="comment">             humidity:  &gt; 70  : class( Do not Play)   num_elements(1)  predict_prob(1)               </span>
<a name="l00331"></a>00331 <span class="comment"> </span>
<a name="l00332"></a>00332 <span class="comment">                                                                                                     </span>
<a name="l00333"></a>00333 <span class="comment"> Tree 9                                                                                              </span>
<a name="l00334"></a>00334 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)                    </span>
<a name="l00335"></a>00335 <span class="comment">         outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)                     </span>
<a name="l00336"></a>00336 <span class="comment">         outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(0.75)                      </span>
<a name="l00337"></a>00337 <span class="comment">             windy:  =  false  : class( Play)   num_elements(3)  predict_prob(1)                     </span>
<a name="l00338"></a>00338 <span class="comment">             windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               </span>
<a name="l00339"></a>00339 <span class="comment">         outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)                 </span>
<a name="l00340"></a>00340 <span class="comment"> </span>
<a name="l00341"></a>00341 <span class="comment">                                                                                                     </span>
<a name="l00342"></a>00342 <span class="comment"> Tree 10                                                                                             </span>
<a name="l00343"></a>00343 <span class="comment">     Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)                    </span>
<a name="l00344"></a>00344 <span class="comment">         outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)                     </span>
<a name="l00345"></a>00345 <span class="comment">         outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(1)                         </span>
<a name="l00346"></a>00346 <span class="comment">         outlook:  = sunny  : class( Do not Play)   num_elements(4)  predict_prob(0.75)              </span>
<a name="l00347"></a>00347 <span class="comment">             humidity:  &lt;= 70  : class( Play)   num_elements(1)  predict_prob(1)                     </span>
<a name="l00348"></a>00348 <span class="comment">             humidity:  &gt; 70  : class( Do not Play)   num_elements(3)  predict_prob(1)               </span>
<a name="l00349"></a>00349 <span class="comment"> </span>
<a name="l00350"></a>00350 <span class="comment">(10 rows)</span>
<a name="l00351"></a>00351 <span class="comment">\endverbatim</span>
<a name="l00352"></a>00352 <span class="comment">-# To classify data with the learned model:</span>
<a name="l00353"></a>00353 <span class="comment">\verbatim</span>
<a name="l00354"></a>00354 <span class="comment">sql&gt; select * from MADLIB_SCHEMA.rf_classify(</span>
<a name="l00355"></a>00355 <span class="comment">         &#39;trained_tree_infogain&#39;,  -- name of the trained model</span>
<a name="l00356"></a>00356 <span class="comment">         &#39;golf_data&#39;,              -- name of the table containing data to classify</span>
<a name="l00357"></a>00357 <span class="comment">         &#39;classification_result&#39;); -- name of the output table</span>
<a name="l00358"></a>00358 <span class="comment"> input_set_size | classification_time </span>
<a name="l00359"></a>00359 <span class="comment">----------------+---------------------</span>
<a name="l00360"></a>00360 <span class="comment">             14 | 00:00:02.215017</span>
<a name="l00361"></a>00361 <span class="comment">(1 row)</span>
<a name="l00362"></a>00362 <span class="comment">\endverbatim</span>
<a name="l00363"></a>00363 <span class="comment">-# Check classification results: </span>
<a name="l00364"></a>00364 <span class="comment">\verbatim</span>
<a name="l00365"></a>00365 <span class="comment">sql&gt; select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from</span>
<a name="l00366"></a>00366 <span class="comment">    classification_result c,golf_data t where t.id=c.id order by id;</span>
<a name="l00367"></a>00367 <span class="comment"> id | outlook  | temperature | humidity | windy  |    class     </span>
<a name="l00368"></a>00368 <span class="comment">----+----------+-------------+----------+--------+--------------</span>
<a name="l00369"></a>00369 <span class="comment">  1 | sunny    |          85 |       85 |  false |  Do not Play</span>
<a name="l00370"></a>00370 <span class="comment">  2 | sunny    |          80 |       90 |  true  |  Do not Play</span>
<a name="l00371"></a>00371 <span class="comment">  3 | overcast |          83 |       78 |  false |  Play</span>
<a name="l00372"></a>00372 <span class="comment">  4 | rain     |          70 |       96 |  false |  Play</span>
<a name="l00373"></a>00373 <span class="comment">  5 | rain     |          68 |       80 |  false |  Play</span>
<a name="l00374"></a>00374 <span class="comment">  6 | rain     |          65 |       70 |  true  |  Do not Play</span>
<a name="l00375"></a>00375 <span class="comment">  7 | overcast |          64 |       65 |  true  |  Play</span>
<a name="l00376"></a>00376 <span class="comment">  8 | sunny    |          72 |       95 |  false |  Do not Play</span>
<a name="l00377"></a>00377 <span class="comment">  9 | sunny    |          69 |       70 |  false |  Play</span>
<a name="l00378"></a>00378 <span class="comment"> 10 | rain     |          75 |       80 |  false |  Play</span>
<a name="l00379"></a>00379 <span class="comment"> 11 | sunny    |          75 |       70 |  true  |  Do not Play</span>
<a name="l00380"></a>00380 <span class="comment"> 12 | overcast |          72 |       90 |  true  |  Play</span>
<a name="l00381"></a>00381 <span class="comment"> 13 | overcast |          81 |       75 |  false |  Play</span>
<a name="l00382"></a>00382 <span class="comment"> 14 | rain     |          71 |       80 |  true  |  Do not Play</span>
<a name="l00383"></a>00383 <span class="comment">(14 rows)</span>
<a name="l00384"></a>00384 <span class="comment">\endverbatim</span>
<a name="l00385"></a>00385 <span class="comment">-# Score the data against a validation set:</span>
<a name="l00386"></a>00386 <span class="comment">\verbatim</span>
<a name="l00387"></a>00387 <span class="comment">sql&gt; select * from MADLIB_SCHEMA.rf_score(</span>
<a name="l00388"></a>00388 <span class="comment">        &#39;trained_tree_infogain&#39;,</span>
<a name="l00389"></a>00389 <span class="comment">        &#39;golf_data_validation&#39;,</span>
<a name="l00390"></a>00390 <span class="comment">        0);</span>
<a name="l00391"></a>00391 <span class="comment">     rf_score      </span>
<a name="l00392"></a>00392 <span class="comment">-------------------</span>
<a name="l00393"></a>00393 <span class="comment"> 0.928571428571429</span>
<a name="l00394"></a>00394 <span class="comment">(1 row)</span>
<a name="l00395"></a>00395 <span class="comment">\endverbatim</span>
<a name="l00396"></a>00396 <span class="comment">-# Clean up the random forest and other auxiliary information:</span>
<a name="l00397"></a>00397 <span class="comment">\verbatim</span>
<a name="l00398"></a>00398 <span class="comment">sql&gt; select MADLIB_SCHEMA.rf_clean(&#39;trained_tree_infogain&#39;);</span>
<a name="l00399"></a>00399 <span class="comment"> rf_clean </span>
<a name="l00400"></a>00400 <span class="comment">----------</span>
<a name="l00401"></a>00401 <span class="comment"> t</span>
<a name="l00402"></a>00402 <span class="comment">(1 row)</span>
<a name="l00403"></a>00403 <span class="comment">\endverbatim</span>
<a name="l00404"></a>00404 <span class="comment"></span>
<a name="l00405"></a>00405 <span class="comment">@literature</span>
<a name="l00406"></a>00406 <span class="comment"></span>
<a name="l00407"></a>00407 <span class="comment">[1] http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm</span>
<a name="l00408"></a>00408 <span class="comment"></span>
<a name="l00409"></a>00409 <span class="comment">[2] http://en.wikipedia.org/wiki/Discretization_of_continuous_features</span>
<a name="l00410"></a>00410 <span class="comment"></span>
<a name="l00411"></a>00411 <span class="comment">@sa File rf.sql_in documenting the SQL functions.</span>
<a name="l00412"></a>00412 <span class="comment">*/</span>
<a name="l00413"></a>00413 
<a name="l00414"></a>00414 /*
<a name="l00415"></a>00415  * This composite type stores the result returned by the rf_train function.
<a name="l00416"></a>00416  *
<a name="l00417"></a>00417  * training_time      The total training time.
<a name="l00418"></a>00418  * num_of_samples     The number of records in the training set.
<a name="l00419"></a>00419  * num_trees          The number of trees to be grown.
<a name="l00420"></a>00420  * features_per_node  The number of features chosen for each node.
<a name="l00421"></a>00421  * num_tree_nodes     The number of nodes in the resulting RF.
<a name="l00422"></a>00422  * max_tree_depth     The depth of the deepest trained tree.
<a name="l00423"></a>00423  * split_criterion    The split criterion used to train the RF.
<a name="l00424"></a>00424  *
<a name="l00425"></a>00425  */
<a name="l00426"></a>00426 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result;
<a name="l00427"></a>00427 CREATE TYPE MADLIB_SCHEMA.rf_train_result AS 
<a name="l00428"></a>00428 (   
<a name="l00429"></a>00429     training_time            INTERVAL,
<a name="l00430"></a>00430     num_of_samples           BIGINT,   
<a name="l00431"></a>00431     num_trees                INT,
<a name="l00432"></a>00432     features_per_node        INT,
<a name="l00433"></a>00433     num_tree_nodes           INT,
<a name="l00434"></a>00434     max_tree_depth           INT,
<a name="l00435"></a>00435     split_criterion          TEXT
<a name="l00436"></a>00436 );
<a name="l00437"></a>00437 
<a name="l00438"></a>00438 
<a name="l00439"></a>00439 /*
<a name="l00440"></a>00440  * This composite type stores the result returned by the rf_classify function.
<a name="l00441"></a>00441  *
<a name="l00442"></a>00442  * input_set_size         The number of records in
<a name="l00443"></a>00443  *                        the classification set.
<a name="l00444"></a>00444  * classification_time    The time consumed during classification.
<a name="l00445"></a>00445  *
<a name="l00446"></a>00446  */
<a name="l00447"></a>00447 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result;
<a name="l00448"></a>00448 CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS 
<a name="l00449"></a>00449     (   
<a name="l00450"></a>00450     input_set_size        BIGINT,   
<a name="l00451"></a>00451     classification_time   INTERVAL
<a name="l00452"></a>00452     );
<a name="l00453"></a>00453 <span class="comment"></span>
<a name="l00454"></a>00454 <span class="comment">/**</span>
<a name="l00455"></a>00455 <span class="comment"> * @brief This API is defined for training a random forest.  </span>
<a name="l00456"></a>00456 <span class="comment"> *        The training function provides a number of parameters that enable</span>
<a name="l00457"></a>00457 <span class="comment"> *        more flexible control over how an RF is generated. It constructs the </span>
<a name="l00458"></a>00458 <span class="comment"> *        RF based on a training set stored in a database table, each row of </span>
<a name="l00459"></a>00459 <span class="comment"> *        which defines a set of features, an ID, and a labeled class. Features </span>
<a name="l00460"></a>00460 <span class="comment"> *        could be either discrete or continuous. All the DTs of the resulting RF </span>
<a name="l00461"></a>00461 <span class="comment"> *        will be kept in a single table. </span>
<a name="l00462"></a>00462 <span class="comment"> *</span>
<a name="l00463"></a>00463 <span class="comment"> * We discretize continuous features on local regions during training rather </span>
<a name="l00464"></a>00464 <span class="comment"> * than discretizing on the whole dataset prior to training because local </span>
<a name="l00465"></a>00465 <span class="comment"> * discretization takes into account the context sensitivity.</span>
<a name="l00466"></a>00466 <span class="comment"> *</span>
<a name="l00467"></a>00467 <span class="comment"> * @param split_criterion           The name of the split criterion that should be used </span>
<a name="l00468"></a>00468 <span class="comment"> *                                  for tree construction. The valid values are</span>
<a name="l00469"></a>00469 <span class="comment"> *                                  &#39;infogain&#39;, &#39;gainratio&#39;, and &#39;gini&#39;. It can&#39;t be NULL.</span>
<a name="l00470"></a>00470 <span class="comment"> *                                  Information gain(infogain) and gini index(gini) are biased </span>
<a name="l00471"></a>00471 <span class="comment"> *                                  toward multivalued attributes. Gain ratio(gainratio) adjusts </span>
<a name="l00472"></a>00472 <span class="comment"> *                                  for this bias. However, it tends to prefer unbalanced splits </span>
<a name="l00473"></a>00473 <span class="comment"> *                                  in which one partition is much smaller than the others.</span>
<a name="l00474"></a>00474 <span class="comment"> * @param training_table_name       The name of the table/view with the training data.</span>
<a name="l00475"></a>00475 <span class="comment"> *                                  It can&#39;t be NULL and must exist.</span>
<a name="l00476"></a>00476 <span class="comment"> * @param result_rf_table_name      The name of the table where the resulting trees will  </span>
<a name="l00477"></a>00477 <span class="comment"> *                                  be stored. It can&#39;t be NULL and must not exist.</span>
<a name="l00478"></a>00478 <span class="comment"> * @param num_trees                 The number of trees to be trained. </span>
<a name="l00479"></a>00479 <span class="comment"> *                                  If it&#39;s NULL, 10 will be used. </span>
<a name="l00480"></a>00480 <span class="comment"> * @param features_per_node         The number of features to be considered when finding </span>
<a name="l00481"></a>00481 <span class="comment"> *                                  a best split. If it&#39;s NULL, sqrt(p), where p is the  </span>
<a name="l00482"></a>00482 <span class="comment"> *                                  number of features, will be used. </span>
<a name="l00483"></a>00483 <span class="comment"> * @param sampling_percentage       The percentage of records sampled to train a tree.</span>
<a name="l00484"></a>00484 <span class="comment"> *                                  It can&#39;t be NULL and must be greater than 0; the short form of this API passes 0.632 (0.632 bootstrap).</span>
<a name="l00485"></a>00485 <span class="comment"> * @param continuous_feature_names  A comma-separated list of the names of the </span>
<a name="l00486"></a>00486 <span class="comment"> *                                  features whose values are continuous.</span>
<a name="l00487"></a>00487 <span class="comment"> *                                  NULL means there are no continuous features.  </span>
<a name="l00488"></a>00488 <span class="comment"> * @param feature_col_names         A comma-separated list of names of the table columns, </span>
<a name="l00489"></a>00489 <span class="comment"> *                                  each of which defines a feature. NULL means all the </span>
<a name="l00490"></a>00490 <span class="comment"> *                                  columns except the ID and Class columns will be treated as</span>
<a name="l00491"></a>00491 <span class="comment"> *                                  features.</span>
<a name="l00492"></a>00492 <span class="comment"> * @param id_col_name               The name of the column containing the ID of each record.</span>
<a name="l00493"></a>00493 <span class="comment"> *                                  It can&#39;t be NULL.</span>
<a name="l00494"></a>00494 <span class="comment"> * @param class_col_name            The name of the column containing the correct class of </span>
<a name="l00495"></a>00495 <span class="comment"> *                                  each record. It can&#39;t be NULL.</span>
<a name="l00496"></a>00496 <span class="comment"> * @param how2handle_missing_value  The way to handle missing values. The valid values are </span>
<a name="l00497"></a>00497 <span class="comment"> *                                  &#39;explicit&#39; and &#39;ignore&#39;. It can&#39;t be NULL.</span>
<a name="l00498"></a>00498 <span class="comment"> * @param max_tree_depth            The maximum tree depth. It can&#39;t be NULL.</span>
<a name="l00499"></a>00499 <span class="comment"> * @param node_prune_threshold      The minimum percentage of the number of records required in a</span>
<a name="l00500"></a>00500 <span class="comment"> *                                  child node. It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
<a name="l00501"></a>00501 <span class="comment"> *                                  This threshold only applies to the non-root nodes. Therefore,</span>
<a name="l00502"></a>00502 <span class="comment"> *                                  if the percentage(p) between the sampled training set size of a tree</span>
<a name="l00503"></a>00503 <span class="comment"> *                                  (the number of rows) and the total training set size is less than</span>
<a name="l00504"></a>00504 <span class="comment"> *                                  or equal to the value of this parameter, then the tree only has</span>
<a name="l00505"></a>00505 <span class="comment"> *                                  one node (the root node).</span>
<a name="l00506"></a>00506 <span class="comment"> *                                  If its value is 1, then the percentage p is always less than or equal to it.</span>
<a name="l00507"></a>00507 <span class="comment"> *                                  Therefore, the tree only has one node (the root node).</span>
<a name="l00508"></a>00508 <span class="comment"> *                                  If its value is 0, then no nodes will be pruned by this parameter.</span>
<a name="l00509"></a>00509 <span class="comment"> * @param node_split_threshold      The minimum percentage of the number of records required in a</span>
<a name="l00510"></a>00510 <span class="comment"> *                                  node in order for a further split to be possible.</span>
<a name="l00511"></a>00511 <span class="comment"> *                                  It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
<a name="l00512"></a>00512 <span class="comment"> *                                  If the percentage(p) between the sampled training set size of a tree</span>
<a name="l00513"></a>00513 <span class="comment"> *                                  (the number of rows) and the total training set size is less than</span>
<a name="l00514"></a>00514 <span class="comment"> *                                  the value of this parameter, then the root node will be a leaf one.</span>
<a name="l00515"></a>00515 <span class="comment"> *                                  Therefore, the trained tree only has one node.</span>
<a name="l00516"></a>00516 <span class="comment"> *                                  If the percentage p is equal to the value of this parameter, then the</span>
<a name="l00517"></a>00517 <span class="comment"> *                                  trained tree only has two levels, since only the root node will grow.</span>
<a name="l00518"></a>00518 <span class="comment"> *                                  If its value is 0, then trees can grow extensively.</span>
<a name="l00519"></a>00519 <span class="comment"> *</span>
<a name="l00520"></a>00520 <span class="comment"> * @param verbosity                 &gt; 0 means this function runs in verbose mode. </span>
<a name="l00521"></a>00521 <span class="comment"> *                                  It can&#39;t be NULL.</span>
<a name="l00522"></a>00522 <span class="comment"> *</span>
<a name="l00523"></a>00523 <span class="comment"> * @return An rf_train_result object.</span>
<a name="l00524"></a>00524 <span class="comment"> *</span>
<a name="l00525"></a>00525 <span class="comment"> */</span>
<a name="l00526"></a>00526 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
<a name="l00527"></a>00527     (
<a name="l00528"></a>00528     split_criterion             TEXT,
<a name="l00529"></a>00529     training_table_name         TEXT, 
<a name="l00530"></a>00530     result_rf_table_name        TEXT,
<a name="l00531"></a>00531     num_trees                   INT, 
<a name="l00532"></a>00532     features_per_node           INT,
<a name="l00533"></a>00533     sampling_percentage         FLOAT,
<a name="l00534"></a>00534     continuous_feature_names    TEXT, 
<a name="l00535"></a>00535     feature_col_names           TEXT, 
<a name="l00536"></a><a class="code" href="rf_8sql__in.html#a3cf718282802b63dc0a0d19b34f6829b">00536</a>     id_col_name                 TEXT, 
<a name="l00537"></a>00537     class_col_name              TEXT, 
<a name="l00538"></a>00538     how2handle_missing_value    TEXT,
<a name="l00539"></a>00539     max_tree_depth              INT,
<a name="l00540"></a>00540     node_prune_threshold        FLOAT,
<a name="l00541"></a>00541     node_split_threshold        FLOAT, 
<a name="l00542"></a>00542     verbosity                   INT
<a name="l00543"></a>00543     ) 
<a name="l00544"></a>00544 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
<a name="l00545"></a>00545 DECLARE
<a name="l00546"></a>00546     begin_func_exec                 TIMESTAMP;
<a name="l00547"></a>00547     rf_table_name                   TEXT;
<a name="l00548"></a>00548     h2hmv_routine_id                INT := 1;
<a name="l00549"></a>00549     ret                             MADLIB_SCHEMA.rf_train_result;
<a name="l00550"></a>00550     train_rs                        RECORD;
<a name="l00551"></a>00551     n_fids                          INT;
<a name="l00552"></a>00552     features_per_node_tmp           INT;
<a name="l00553"></a>00553     curstmt                         TEXT;
<a name="l00554"></a>00554     enc_info                        TEXT[];
<a name="l00555"></a>00555 BEGIN   
<a name="l00556"></a>00556     begin_func_exec = clock_timestamp();
<a name="l00557"></a>00557     
<a name="l00558"></a>00558     IF (verbosity &lt; 1) THEN
<a name="l00559"></a>00559         -- get rid of the messages whose severity level is lower than &#39;WARNING<span class="stringliteral">&#39;</span>
<a name="l00560"></a>00560 <span class="stringliteral">        SET client_min_messages = WARNING;</span>
<a name="l00561"></a>00561 <span class="stringliteral">    END IF;</span>
<a name="l00562"></a>00562 <span class="stringliteral"></span>
<a name="l00563"></a>00563 <span class="stringliteral">    PERFORM MADLIB_SCHEMA.__assert</span>
<a name="l00564"></a>00564 <span class="stringliteral">        (</span>
<a name="l00565"></a>00565 <span class="stringliteral">            num_trees IS NOT NULL                                   AND</span>
<a name="l00566"></a>00566 <span class="stringliteral">            sampling_percentage IS NOT NULL                         AND</span>
<a name="l00567"></a>00567 <span class="stringliteral">            num_trees  &gt; 0                                          AND</span>
<a name="l00568"></a>00568 <span class="stringliteral">            (features_per_node IS NULL OR  features_per_node &gt; 0)   AND</span>
<a name="l00569"></a>00569 <span class="stringliteral">            sampling_percentage &gt; 0,</span>
<a name="l00570"></a>00570 <span class="stringliteral">            &#39;</span>invalid parameter value <span class="keywordflow">for</span> num_trees, features_per_node or sampling_percentage<span class="stringliteral">&#39;</span>
<a name="l00571"></a>00571 <span class="stringliteral">        );</span>
<a name="l00572"></a>00572 <span class="stringliteral"></span>
<a name="l00573"></a>00573 <span class="stringliteral">    rf_table_name = btrim(lower(result_rf_table_name), &#39;</span> <span class="stringliteral">&#39;);</span>
<a name="l00574"></a>00574 <span class="stringliteral">    PERFORM MADLIB_SCHEMA.__check_dt_common_params</span>
<a name="l00575"></a>00575 <span class="stringliteral">        (</span>
<a name="l00576"></a>00576 <span class="stringliteral">            split_criterion,</span>
<a name="l00577"></a>00577 <span class="stringliteral">            training_table_name, </span>
<a name="l00578"></a>00578 <span class="stringliteral">            rf_table_name,</span>
<a name="l00579"></a>00579 <span class="stringliteral">            continuous_feature_names, </span>
<a name="l00580"></a>00580 <span class="stringliteral">            feature_col_names, </span>
<a name="l00581"></a>00581 <span class="stringliteral">            id_col_name, </span>
<a name="l00582"></a>00582 <span class="stringliteral">            class_col_name, </span>
<a name="l00583"></a>00583 <span class="stringliteral">            how2handle_missing_value,</span>
<a name="l00584"></a>00584 <span class="stringliteral">            max_tree_depth,</span>
<a name="l00585"></a>00585 <span class="stringliteral">            node_prune_threshold,</span>
<a name="l00586"></a>00586 <span class="stringliteral">            node_split_threshold, </span>
<a name="l00587"></a>00587 <span class="stringliteral">            verbosity,</span>
<a name="l00588"></a>00588 <span class="stringliteral">            &#39;</span>random forest<span class="stringliteral">&#39;</span>
<a name="l00589"></a>00589 <span class="stringliteral">        );</span>
<a name="l00590"></a>00590 <span class="stringliteral"></span>
<a name="l00591"></a>00591 <span class="stringliteral">    train_rs = MADLIB_SCHEMA.__encode_and_train</span>
<a name="l00592"></a>00592 <span class="stringliteral">        (</span>
<a name="l00593"></a>00593 <span class="stringliteral">            &#39;</span>RF<span class="stringliteral">&#39;,</span>
<a name="l00594"></a>00594 <span class="stringliteral">            split_criterion,</span>
<a name="l00595"></a>00595 <span class="stringliteral">            num_trees,</span>
<a name="l00596"></a>00596 <span class="stringliteral">            features_per_node,</span>
<a name="l00597"></a>00597 <span class="stringliteral">            training_table_name,</span>
<a name="l00598"></a>00598 <span class="stringliteral">            NULL,</span>
<a name="l00599"></a>00599 <span class="stringliteral">            rf_table_name,</span>
<a name="l00600"></a>00600 <span class="stringliteral">            continuous_feature_names, </span>
<a name="l00601"></a>00601 <span class="stringliteral">            feature_col_names, </span>
<a name="l00602"></a>00602 <span class="stringliteral">            id_col_name, </span>
<a name="l00603"></a>00603 <span class="stringliteral">            class_col_name, </span>
<a name="l00604"></a>00604 <span class="stringliteral">            100.0,</span>
<a name="l00605"></a>00605 <span class="stringliteral">            how2handle_missing_value,</span>
<a name="l00606"></a>00606 <span class="stringliteral">            max_tree_depth,</span>
<a name="l00607"></a>00607 <span class="stringliteral">            sampling_percentage,</span>
<a name="l00608"></a>00608 <span class="stringliteral">            &#39;</span>t<span class="stringliteral">&#39;,</span>
<a name="l00609"></a>00609 <span class="stringliteral">            node_prune_threshold,</span>
<a name="l00610"></a>00610 <span class="stringliteral">            node_split_threshold, </span>
<a name="l00611"></a>00611 <span class="stringliteral">            &#39;</span>&lt;RF table schema name&gt;_&lt;RF table name&gt;<span class="stringliteral">&#39;,</span>
<a name="l00612"></a>00612 <span class="stringliteral">            verbosity</span>
<a name="l00613"></a>00613 <span class="stringliteral">        );</span>
<a name="l00614"></a>00614 <span class="stringliteral"></span>
<a name="l00615"></a>00615 <span class="stringliteral">    IF ( verbosity &gt; 0 ) THEN</span>
<a name="l00616"></a>00616 <span class="stringliteral">            RAISE INFO &#39;</span>Training Total Time: %<span class="stringliteral">&#39;, clock_timestamp() - begin_func_exec;</span>
<a name="l00617"></a>00617 <span class="stringliteral">            RAISE INFO &#39;</span>training result:%<span class="stringliteral">&#39;, train_rs;</span>
<a name="l00618"></a>00618 <span class="stringliteral">    END IF;</span>
<a name="l00619"></a>00619 <span class="stringliteral"></span>
<a name="l00620"></a>00620 <span class="stringliteral">    ret.training_time           = clock_timestamp() - begin_func_exec;</span>
<a name="l00621"></a>00621 <span class="stringliteral">    ret.num_of_samples          = train_rs.num_of_samples;      </span>
<a name="l00622"></a>00622 <span class="stringliteral">    ret.num_trees               = num_trees; </span>
<a name="l00623"></a>00623 <span class="stringliteral">    ret.features_per_node       = train_rs.features_per_node; </span>
<a name="l00624"></a>00624 <span class="stringliteral">    ret.num_tree_nodes          = train_rs.num_tree_nodes; </span>
<a name="l00625"></a>00625 <span class="stringliteral">    ret.max_tree_depth          = train_rs.max_tree_depth;</span>
<a name="l00626"></a>00626 <span class="stringliteral">    ret.split_criterion         = split_criterion;</span>
<a name="l00627"></a>00627 <span class="stringliteral">    RETURN ret;</span>
<a name="l00628"></a>00628 <span class="stringliteral">END</span>
<a name="l00629"></a>00629 <span class="stringliteral">$$ LANGUAGE PLPGSQL;</span>
<a name="l00630"></a>00630 <span class="stringliteral"></span>
<a name="l00631"></a>00631 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00632"></a>00632 <span class="comment">/**</span>
<a name="l00633"></a>00633 <span class="comment"> * @brief This API (short form) is defined for training a random forest.  </span>
<a name="l00634"></a>00634 <span class="comment"> *        For convenience, a short form of the training API with three parameters is </span>
<a name="l00635"></a>00635 <span class="comment"> *        also defined. This one needs only the split criterion name, the name of the </span>
<a name="l00636"></a>00636 <span class="comment"> *        table where training data is kept, and the name of the table where the </span>
<a name="l00637"></a>00637 <span class="comment"> *        trained RF should be kept. All other parameters in the full form will take </span>
<a name="l00638"></a>00638 <span class="comment"> *        their default values.</span>
<a name="l00639"></a>00639 <span class="comment"> *</span>
<a name="l00640"></a>00640 <span class="comment"> * @param split_criterion           The split criterion used for tree construction. </span>
<a name="l00641"></a>00641 <span class="comment"> *                                  The valid values are infogain, gainratio, or</span>
<a name="l00642"></a>00642 <span class="comment"> *                                  gini. It can&#39;t be NULL.</span>
<a name="l00643"></a>00643 <span class="comment"> * @param training_table_name       The name of the table/view with the training data.</span>
<a name="l00644"></a>00644 <span class="comment"> *                                  It can&#39;t be NULL and must exist.</span>
<a name="l00645"></a>00645 <span class="comment"> * @param result_rf_table_name      The name of the table where the resulting trees will  </span>
<a name="l00646"></a>00646 <span class="comment"> *                                  be stored. It can&#39;t be NULL and must not exist.</span>
<a name="l00647"></a>00647 <span class="comment"> *</span>
<a name="l00648"></a>00648 <span class="comment"> * @return An rf_train_result object.</span>
<a name="l00649"></a>00649 <span class="comment"> *</span>
<a name="l00650"></a>00650 <span class="comment"> */</span>
<a name="l00651"></a>00651 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
<a name="l00652"></a>00652     (
<a name="l00653"></a>00653     split_criterion             TEXT,
<a name="l00654"></a>00654     training_table_name         TEXT, 
<a name="l00655"></a>00655     result_rf_table_name        TEXT
<a name="l00656"></a>00656     ) 
<a name="l00657"></a>00657 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
<a name="l00658"></a>00658 DECLARE
<a name="l00659"></a>00659     ret                         MADLIB_SCHEMA.rf_train_result;
<a name="l00660"></a>00660 BEGIN   
<a name="l00661"></a><a class="code" href="rf_8sql__in.html#a3981c021e89c0c5f40ab436d96848845">00661</a>     /*
<a name="l00662"></a>00662         There is a well-known bootstrap method, called 0.632 bootstrap. According
<a name="l00663"></a>00663         to the book &quot;Data mining concepts and techniques, 3rd Edition&quot;, if we
<a name="l00664"></a>00664         are given a data set of D tuples and each tuple has a probability 1/D of 
<a name="l00665"></a>00665         being selected, then the probability of not being chosen is 1 − 1/D. We have
<a name="l00666"></a>00666         to select D times, so the probability that a tuple will not be chosen during
<a name="l00667"></a>00667         this whole time is (1−1/D)^D. If D is large, the probability approaches e^−1. 
<a name="l00668"></a>00668         Thus, 36.8% of tuples will not be selected for training. And the remaining 
<a name="l00669"></a>00669         63.2% will form the training set.
<a name="l00670"></a>00670         Therefore, we set the default value of &#39;sampling_percentage<span class="stringliteral">&#39; to 0.632.</span>
<a name="l00671"></a>00671 <span class="stringliteral">    */</span>
<a name="l00672"></a>00672 <span class="stringliteral">    ret = MADLIB_SCHEMA.rf_train</span>
<a name="l00673"></a>00673 <span class="stringliteral">            (</span>
<a name="l00674"></a>00674 <span class="stringliteral">                split_criterion,</span>
<a name="l00675"></a>00675 <span class="stringliteral">                training_table_name,</span>
<a name="l00676"></a>00676 <span class="stringliteral">                result_rf_table_name,</span>
<a name="l00677"></a>00677 <span class="stringliteral">                10, </span>
<a name="l00678"></a>00678 <span class="stringliteral">                null,</span>
<a name="l00679"></a>00679 <span class="stringliteral">                0.632,</span>
<a name="l00680"></a>00680 <span class="stringliteral">                null,</span>
<a name="l00681"></a>00681 <span class="stringliteral">                null,</span>
<a name="l00682"></a>00682 <span class="stringliteral">                &#39;</span><span class="keywordtype">id</span><span class="stringliteral">&#39;,</span>
<a name="l00683"></a>00683 <span class="stringliteral">                &#39;</span><span class="keyword">class</span><span class="stringliteral">&#39;,</span>
<a name="l00684"></a>00684 <span class="stringliteral">                &#39;</span><span class="keyword">explicit</span><span class="stringliteral">&#39;,</span>
<a name="l00685"></a>00685 <span class="stringliteral">                10,</span>
<a name="l00686"></a>00686 <span class="stringliteral">                0.0,</span>
<a name="l00687"></a>00687 <span class="stringliteral">                0.0,</span>
<a name="l00688"></a>00688 <span class="stringliteral">                0,</span>
<a name="l00689"></a>00689 <span class="stringliteral">                0</span>
<a name="l00690"></a>00690 <span class="stringliteral">            );</span>
<a name="l00691"></a>00691 <span class="stringliteral">    </span>
<a name="l00692"></a>00692 <span class="stringliteral">    RETURN ret;</span>
<a name="l00693"></a>00693 <span class="stringliteral">END</span>
<a name="l00694"></a>00694 <span class="stringliteral">$$ LANGUAGE PLPGSQL;</span>
<a name="l00695"></a>00695 <span class="stringliteral"></span>
<a name="l00696"></a>00696 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00697"></a>00697 <span class="comment">/**</span>
<a name="l00698"></a>00698 <span class="comment"> * @brief Display the trees in the random forest in a human-readable format.</span>
<a name="l00699"></a>00699 <span class="comment"> *</span>
<a name="l00700"></a>00700 <span class="comment"> * @param rf_table_name The name of RF table. It can&#39;t be NULL and must exist.</span>
<a name="l00701"></a>00701 <span class="comment"> * @param tree_id       The trees to be displayed. If it&#39;s NULL, we </span>
<a name="l00702"></a>00702 <span class="comment"> *                      display all the trees.</span>
<a name="l00703"></a>00703 <span class="comment"> * @param max_depth     The max depth to be displayed. If it&#39;s NULL, this </span>
<a name="l00704"></a>00704 <span class="comment"> *                      function will show all levels.</span>
<a name="l00705"></a>00705 <span class="comment"> *                    </span>
<a name="l00706"></a>00706 <span class="comment"> * @return The text representing the trees in the random forest in a </span>
<a name="l00707"></a>00707 <span class="comment"> *         human-readable format.</span>
<a name="l00708"></a>00708 <span class="comment"> *</span>
<a name="l00709"></a>00709 <span class="comment"> */</span>
<a name="l00710"></a>00710 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
<a name="l00711"></a>00711     (
<a name="l00712"></a>00712     rf_table_name   TEXT,
<a name="l00713"></a>00713     tree_id         INT[],
<a name="l00714"></a>00714     max_depth       INT
<a name="l00715"></a>00715     ) 
<a name="l00716"></a>00716 RETURNS SETOF TEXT AS $$
<a name="l00717"></a>00717 DECLARE
<a name="l00718"></a>00718     tid     INT;
<a name="l00719"></a>00719     tids    INT[];
<a name="l00720"></a><a class="code" href="rf_8sql__in.html#a60b3f093409a14331b7601522ac0ac42">00720</a>     str     TEXT;
<a name="l00721"></a>00721     max_tid INT;
<a name="l00722"></a>00722     i       INT;
<a name="l00723"></a>00723 BEGIN
<a name="l00724"></a>00724     -- get rid of the messages whose severity level is lower than &#39;WARNING<span class="stringliteral">&#39;</span>
<a name="l00725"></a>00725 <span class="stringliteral">    SET client_min_messages = WARNING;</span>
<a name="l00726"></a>00726 <span class="stringliteral">    </span>
<a name="l00727"></a>00727 <span class="stringliteral">    PERFORM MADLIB_SCHEMA.__assert</span>
<a name="l00728"></a>00728 <span class="stringliteral">            (</span>
<a name="l00729"></a>00729 <span class="stringliteral">                (rf_table_name IS NOT NULL) AND</span>
<a name="l00730"></a>00730 <span class="stringliteral">                (</span>
<a name="l00731"></a>00731 <span class="stringliteral">                 MADLIB_SCHEMA.__table_exists</span>
<a name="l00732"></a>00732 <span class="stringliteral">                    (</span>
<a name="l00733"></a>00733 <span class="stringliteral">                        rf_table_name</span>
<a name="l00734"></a>00734 <span class="stringliteral">                    )</span>
<a name="l00735"></a>00735 <span class="stringliteral">                ),</span>
<a name="l00736"></a>00736 <span class="stringliteral">                &#39;</span>the specified tree table<span class="stringliteral">&#39; || </span>
<a name="l00737"></a>00737 <span class="stringliteral">                coalesce</span>
<a name="l00738"></a>00738 <span class="stringliteral">                (</span>
<a name="l00739"></a>00739 <span class="stringliteral">                    &#39;</span>&lt;<span class="stringliteral">&#39; || rf_table_name || &#39;</span>&gt; does not exists<span class="stringliteral">&#39;, </span>
<a name="l00740"></a>00740 <span class="stringliteral">                    &#39;</span> is NULL<span class="stringliteral">&#39;</span>
<a name="l00741"></a>00741 <span class="stringliteral">                )</span>
<a name="l00742"></a>00742 <span class="stringliteral">            ); </span>
<a name="l00743"></a>00743 <span class="stringliteral"></span>
<a name="l00744"></a>00744 <span class="stringliteral">    PERFORM MADLIB_SCHEMA.__assert</span>
<a name="l00745"></a>00745 <span class="stringliteral">            (</span>
<a name="l00746"></a>00746 <span class="stringliteral">                max_depth IS NULL OR</span>
<a name="l00747"></a>00747 <span class="stringliteral">                max_depth &gt; 0,</span>
<a name="l00748"></a>00748 <span class="stringliteral">                &#39;</span>the max tree depth must be NULL or greater than 0<span class="stringliteral">&#39;              </span>
<a name="l00749"></a>00749 <span class="stringliteral">            );   </span>
<a name="l00750"></a>00750 <span class="stringliteral"></span>
<a name="l00751"></a>00751 <span class="stringliteral">    -- IF tree_id is null, display all these trees</span>
<a name="l00752"></a>00752 <span class="stringliteral">    IF (tree_id IS NULL) THEN</span>
<a name="l00753"></a>00753 <span class="stringliteral">        FOR tid IN EXECUTE &#39;</span>SELECT distinct tid FROM <span class="stringliteral">&#39;||rf_table_name LOOP</span>
<a name="l00754"></a>00754 <span class="stringliteral">            tids = array_append(tids, tid);</span>
<a name="l00755"></a>00755 <span class="stringliteral">        END LOOP;</span>
<a name="l00756"></a>00756 <span class="stringliteral">    ELSE</span>
<a name="l00757"></a>00757 <span class="stringliteral">        tids = tree_id;</span>
<a name="l00758"></a>00758 <span class="stringliteral">        EXECUTE &#39;</span>SELECT max(tid) FROM <span class="stringliteral">&#39;||rf_table_name INTO max_tid;</span>
<a name="l00759"></a>00759 <span class="stringliteral">     </span>
<a name="l00760"></a>00760 <span class="stringliteral">        FOR i IN 1..array_upper(tids, 1) LOOP</span>
<a name="l00761"></a>00761 <span class="stringliteral">            tid = tids[i];</span>
<a name="l00762"></a>00762 <span class="stringliteral">            PERFORM MADLIB_SCHEMA.__assert</span>
<a name="l00763"></a>00763 <span class="stringliteral">                    (</span>
<a name="l00764"></a>00764 <span class="stringliteral">                        tid IS NOT NULL AND</span>
<a name="l00765"></a>00765 <span class="stringliteral">                        tid &gt; 0         AND</span>
<a name="l00766"></a>00766 <span class="stringliteral">                        tid &lt;= max_tid, </span>
<a name="l00767"></a>00767 <span class="stringliteral">                        &#39;</span>the ID of the tree in the array must be in range [1, <span class="stringliteral">&#39; || </span>
<a name="l00768"></a>00768 <span class="stringliteral">                        max_tid                                                 || </span>
<a name="l00769"></a>00769 <span class="stringliteral">                        &#39;</span>]<span class="stringliteral">&#39;              </span>
<a name="l00770"></a>00770 <span class="stringliteral">                    );          </span>
<a name="l00771"></a>00771 <span class="stringliteral">        END LOOP;</span>
<a name="l00772"></a>00772 <span class="stringliteral">    END IF;</span>
<a name="l00773"></a>00773 <span class="stringliteral">    </span>
<a name="l00774"></a>00774 <span class="stringliteral">    FOR str IN SELECT * FROM </span>
<a name="l00775"></a>00775 <span class="stringliteral">m4_changequote(`&gt;&gt;&gt;&#39;</span>, `&lt;&lt;&lt;<span class="stringliteral">&#39;)</span>
<a name="l00776"></a>00776 <span class="stringliteral">m4_ifdef(&gt;&gt;&gt;__HAS_ORDERED_AGGREGATES__&lt;&lt;&lt;, &gt;&gt;&gt;</span>
<a name="l00777"></a>00777 <span class="stringliteral">         MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr</span>
<a name="l00778"></a>00778 <span class="stringliteral">         (</span>
<a name="l00779"></a>00779 <span class="stringliteral">            rf_table_name,</span>
<a name="l00780"></a>00780 <span class="stringliteral">            tids,</span>
<a name="l00781"></a>00781 <span class="stringliteral">            max_depth</span>
<a name="l00782"></a>00782 <span class="stringliteral">         ) LOOP</span>
<a name="l00783"></a>00783 <span class="stringliteral">&lt;&lt;&lt;, &gt;&gt;&gt;</span>
<a name="l00784"></a>00784 <span class="stringliteral">         MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr</span>
<a name="l00785"></a>00785 <span class="stringliteral">         (</span>
<a name="l00786"></a>00786 <span class="stringliteral">            rf_table_name,</span>
<a name="l00787"></a>00787 <span class="stringliteral">            tids,</span>
<a name="l00788"></a>00788 <span class="stringliteral">            max_depth</span>
<a name="l00789"></a>00789 <span class="stringliteral">         ) LOOP</span>
<a name="l00790"></a>00790 <span class="stringliteral">&lt;&lt;&lt;)</span>
<a name="l00791"></a>00791 <span class="stringliteral">m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;</span>&lt;&lt;&lt;)
<a name="l00792"></a>00792         RETURN NEXT str;
<a name="l00793"></a>00793     END LOOP;
<a name="l00794"></a>00794     RETURN;
<a name="l00795"></a>00795 END $$ LANGUAGE PLPGSQL;
<a name="l00796"></a>00796 
<a name="l00797"></a>00797 <span class="comment"></span>
<a name="l00798"></a>00798 <span class="comment">/**</span>
<a name="l00799"></a>00799 <span class="comment"> * @brief Display the trees in the random forest in a human-readable format.</span>
<a name="l00800"></a>00800 <span class="comment"> *        This function displays all the levels of these specified trees.</span>
<a name="l00801"></a>00801 <span class="comment"> *</span>
<a name="l00802"></a>00802 <span class="comment"> * @param rf_table_name The name of RF table. It can&#39;t be NULL and must exist.</span>
<a name="l00803"></a>00803 <span class="comment"> * @param tree_id       The trees to be displayed. If it&#39;s NULL, we </span>
<a name="l00804"></a>00804 <span class="comment"> *                      display all the trees.</span>
<a name="l00805"></a>00805 <span class="comment"> *                    </span>
<a name="l00806"></a>00806 <span class="comment"> * @return The text representing the trees in the random forest in a </span>
<a name="l00807"></a>00807 <span class="comment"> *         human-readable format.</span>
<a name="l00808"></a>00808 <span class="comment"> *</span>
<a name="l00809"></a>00809 <span class="comment"> */</span>
<a name="l00810"></a>00810 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
<a name="l00811"></a>00811     (
<a name="l00812"></a>00812     rf_table_name   TEXT,
<a name="l00813"></a><a class="code" href="rf_8sql__in.html#a8ff04ad76d40eed5036706b365403376">00813</a>     tree_id         INT[]    
<a name="l00814"></a>00814     ) 
<a name="l00815"></a>00815 RETURNS SETOF TEXT AS $$
<a name="l00816"></a>00816 DECLARE
<a name="l00817"></a>00817     str     TEXT;
<a name="l00818"></a>00818 BEGIN
<a name="l00819"></a>00819     FOR str IN SELECT * FROM 
<a name="l00820"></a>00820                MADLIB_SCHEMA.rf_display(rf_table_name,tree_id,NULL) LOOP
<a name="l00821"></a>00821         RETURN NEXT str;
<a name="l00822"></a>00822     END LOOP;
<a name="l00823"></a>00823     RETURN;
<a name="l00824"></a>00824 END $$ LANGUAGE PLPGSQL;
<a name="l00825"></a>00825 
<a name="l00826"></a>00826 <span class="comment"></span>
<a name="l00827"></a>00827 <span class="comment">/**</span>
<a name="l00828"></a>00828 <span class="comment"> * @brief Display the trees in the random forest in a human-readable format.</span>
<a name="l00829"></a>00829 <span class="comment"> *        This function displays all the levels of all trees in RF.</span>
<a name="l00830"></a>00830 <span class="comment"> *</span>
<a name="l00831"></a>00831 <span class="comment"> * @param rf_table_name The name of RF table. It can&#39;t be NULL and must exist.</span>
<a name="l00832"></a>00832 <span class="comment"> *</span>
<a name="l00833"></a>00833 <span class="comment"> *                    </span>
<a name="l00834"></a>00834 <span class="comment"> * @return The text representing the trees in the random forest in a </span>
<a name="l00835"></a>00835 <span class="comment"> *         human-readable format.</span>
<a name="l00836"></a>00836 <span class="comment"> *</span>
<a name="l00837"></a>00837 <span class="comment"> */</span>
<a name="l00838"></a>00838 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
<a name="l00839"></a>00839     (
<a name="l00840"></a>00840     rf_table_name  TEXT
<a name="l00841"></a><a class="code" href="rf_8sql__in.html#af89e4b67475e2e57039382467fa43747">00841</a>     ) 
<a name="l00842"></a>00842 RETURNS SETOF TEXT AS $$
<a name="l00843"></a>00843 DECLARE
<a name="l00844"></a>00844     str     TEXT;
<a name="l00845"></a>00845 BEGIN
<a name="l00846"></a>00846     FOR str IN SELECT * FROM 
<a name="l00847"></a>00847                MADLIB_SCHEMA.rf_display(rf_table_name,NULL) LOOP
<a name="l00848"></a>00848         RETURN NEXT str;
<a name="l00849"></a>00849     END LOOP;
<a name="l00850"></a>00850     RETURN;
<a name="l00851"></a>00851 END $$ LANGUAGE PLPGSQL;
<a name="l00852"></a>00852 
<a name="l00853"></a>00853 <span class="comment"></span>
<a name="l00854"></a>00854 <span class="comment">/**</span>
<a name="l00855"></a>00855 <span class="comment"> * @brief Classify dataset using a trained RF.</span>
<a name="l00856"></a>00856 <span class="comment"> *</span>
<a name="l00857"></a>00857 <span class="comment"> * The classification result will be stored in a table that is defined </span>
<a name="l00858"></a>00858 <span class="comment"> * as follows: </span>
<a name="l00859"></a>00859 <span class="comment"> *</span>
<a name="l00860"></a>00860 <span class="comment"> *  CREATE TABLE classification_result</span>
<a name="l00861"></a>00861 <span class="comment"> *  (</span>
<a name="l00862"></a>00862 <span class="comment"> *     id        INT|BIGINT,</span>
<a name="l00863"></a>00863 <span class="comment"> *     class     SUPPORTED_DATA_TYPE,</span>
<a name="l00864"></a>00864 <span class="comment"> *     prob      FLOAT</span>
<a name="l00865"></a>00865 <span class="comment"> *  );</span>
<a name="l00866"></a>00866 <span class="comment"> * </span>
<a name="l00867"></a>00867 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l00868"></a>00868 <span class="comment"> * @param classification_table_name The name of the table/view that keeps the data </span>
<a name="l00869"></a>00869 <span class="comment"> *                                  to be classified. It can&#39;t be NULL and must exist.</span>
<a name="l00870"></a>00870 <span class="comment"> * @param result_table_name         The name of the result table. It can&#39;t be NULL and must not already exist.</span>
<a name="l00871"></a>00871 <span class="comment"> * @param is_serial_classification  Whether to classify serially, one tree at a time (TRUE), </span>
<a name="l00872"></a>00872 <span class="comment"> *                                  or with all trees at once (FALSE). It can&#39;t be NULL.</span>
<a name="l00873"></a>00873 <span class="comment"> * @param verbosity                 &gt; 0 means this function runs in verbose mode. </span>
<a name="l00874"></a>00874 <span class="comment"> *                                  It can&#39;t be NULL. </span>
<a name="l00875"></a>00875 <span class="comment"> *</span>
<a name="l00876"></a>00876 <span class="comment"> * @return A rf_classify_result object.</span>
<a name="l00877"></a>00877 <span class="comment"> *</span>
<a name="l00878"></a>00878 <span class="comment"> */</span>
<a name="l00879"></a>00879 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
<a name="l00880"></a>00880     (
<a name="l00881"></a>00881     rf_table_name               TEXT, 
<a name="l00882"></a><a class="code" href="rf_8sql__in.html#a116584e8a5194a4277e964900ee5383d">00882</a>     classification_table_name   TEXT, 
<a name="l00883"></a>00883     result_table_name           TEXT, 
<a name="l00884"></a>00884     is_serial_classification    BOOLEAN,
<a name="l00885"></a>00885     verbosity                   INT
<a name="l00886"></a>00886     ) 
<a name="l00887"></a>00887 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
<a name="l00888"></a>00888 DECLARE
<a name="l00889"></a>00889     encoded_table_name  TEXT := <span class="stringliteral">&#39;&#39;</span>;
<a name="l00890"></a>00890     temp_result_table   TEXT := <span class="stringliteral">&#39;&#39;</span>;
<a name="l00891"></a>00891     vote_result_table   TEXT;
<a name="l00892"></a>00892     metatable_name      TEXT;
<a name="l00893"></a>00893     result_rec          RECORD;
<a name="l00894"></a>00894     begin_time          TIMESTAMP;
<a name="l00895"></a>00895     curstmt             TEXT;
<a name="l00896"></a>00896     ret                 MADLIB_SCHEMA.rf_classify_result;
<a name="l00897"></a>00897     table_names         TEXT[];
<a name="l00898"></a>00898 BEGIN
<a name="l00899"></a>00899     IF (verbosity &gt; 0) THEN
<a name="l00900"></a>00900         -- get rid of the messages whose severity level is lower than &#39;WARNING&#39;
<a name="l00901"></a>00901         SET client_min_messages = WARNING;
<a name="l00902"></a>00902     END IF;
<a name="l00903"></a>00903     
<a name="l00904"></a>00904     begin_time = clock_timestamp();
<a name="l00905"></a>00905 
<a name="l00906"></a>00906     PERFORM MADLIB_SCHEMA.__assert
<a name="l00907"></a>00907             (
<a name="l00908"></a>00908                 is_serial_classification IS NOT NULL,
<a name="l00909"></a>00909                 &#39;is_serial_classification must not be null&#39;              
<a name="l00910"></a>00910             );  
<a name="l00911"></a>00911 
<a name="l00912"></a>00912     PERFORM MADLIB_SCHEMA.__assert
<a name="l00913"></a>00913             (
<a name="l00914"></a>00914                 (result_table_name IS NOT NULL) AND
<a name="l00915"></a>00915                 (
<a name="l00916"></a>00916                  NOT MADLIB_SCHEMA.__table_exists
<a name="l00917"></a>00917                     (
<a name="l00918"></a>00918                         result_table_name
<a name="l00919"></a>00919                     )
<a name="l00920"></a>00920                 ),
<a name="l00921"></a>00921                 &#39;the specified result table&#39; || coalesce(&#39;&lt;&#39; || result_table_name || &#39;&gt; exists&#39;, &#39; is NULL&#39;)
<a name="l00922"></a>00922             ); 
<a name="l00923"></a>00923 
<a name="l00924"></a>00924     IF (is_serial_classification) THEN
<a name="l00925"></a>00925         table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
<a name="l00926"></a>00926                         (
<a name="l00927"></a>00927                             classification_table_name, 
<a name="l00928"></a>00928                             rf_table_name, 
<a name="l00929"></a>00929                             verbosity
<a name="l00930"></a>00930                         );
<a name="l00931"></a>00931     ELSE
<a name="l00932"></a>00932         table_names = MADLIB_SCHEMA.__treemodel_classify_internal
<a name="l00933"></a>00933                         (
<a name="l00934"></a>00934                             classification_table_name, 
<a name="l00935"></a>00935                             rf_table_name, 
<a name="l00936"></a>00936                             verbosity
<a name="l00937"></a>00937                         );
<a name="l00938"></a>00938     END IF;
<a name="l00939"></a>00939     
<a name="l00940"></a>00940     encoded_table_name= table_names[1];
<a name="l00941"></a>00941     temp_result_table = table_names[2];
<a name="l00942"></a>00942     vote_result_table = temp_result_table||&#39;_vote&#39;;
<a name="l00943"></a>00943 
<a name="l00944"></a>00944     PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
<a name="l00945"></a>00945         (
<a name="l00946"></a>00946         temp_result_table, 
<a name="l00947"></a>00947         vote_result_table
<a name="l00948"></a>00948         );
<a name="l00949"></a>00949 
<a name="l00950"></a>00950     metatable_name = MADLIB_SCHEMA.__get_metatable_name( rf_table_name );
<a name="l00951"></a>00951 
<a name="l00952"></a>00952     SELECT MADLIB_SCHEMA.__format
<a name="l00953"></a>00953         (
<a name="l00954"></a>00954             &#39;SELECT 
<a name="l00955"></a>00955                 column_name,
<a name="l00956"></a>00956                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 
<a name="l00957"></a>00957              FROM %
<a name="l00958"></a>00958              WHERE column_type=&#39;&#39;c&#39;&#39; LIMIT 1&#39;,
<a name="l00959"></a>00959             ARRAY[
<a name="l00960"></a>00960                 metatable_name
<a name="l00961"></a>00961             ]
<a name="l00962"></a>00962         ) INTO curstmt;
<a name="l00963"></a>00963     
<a name="l00964"></a>00964     EXECUTE curstmt INTO result_rec;
<a name="l00965"></a>00965             
<a name="l00966"></a>00966    -- translate the encoded class information back
<a name="l00967"></a>00967     EXECUTE &#39;CREATE TABLE &#39;||result_table_name||&#39; AS SELECT n.<span class="keywordtype">id</span>, 
<a name="l00968"></a>00968              m.fval as class,n.prob from &#39;||vote_result_table||
<a name="l00969"></a>00969         &#39; n,&#39;||result_rec.table_name||&#39; m where n.class=m.code 
<a name="l00970"></a>00970         m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (<span class="keywordtype">id</span>)&#39;);&#39;;
<a name="l00971"></a>00971         
<a name="l00972"></a>00972     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39;;&#39;;
<a name="l00973"></a>00973     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || temp_result_table || &#39;;&#39;;
<a name="l00974"></a>00974     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || vote_result_table || &#39;;&#39;;
<a name="l00975"></a>00975     EXECUTE &#39;SELECT COUNT(*) FROM &#39; ||classification_table_name||&#39;;&#39; 
<a name="l00976"></a>00976              INTO ret.input_set_size;
<a name="l00977"></a>00977     
<a name="l00978"></a>00978     ret.classification_time = clock_timestamp() - begin_time;
<a name="l00979"></a>00979     RETURN ret;
<a name="l00980"></a>00980 END
<a name="l00981"></a>00981 $$ LANGUAGE PLPGSQL;
<a name="l00982"></a>00982 
<a name="l00983"></a>00983 <span class="comment"></span>
<a name="l00984"></a>00984 <span class="comment">/**</span>
<a name="l00985"></a>00985 <span class="comment"> * @brief Classify a dataset using a trained RF. This function does </span>
<a name="l00986"></a>00986 <span class="comment"> *        the same thing as the full version defined above, except </span>
<a name="l00987"></a>00987 <span class="comment"> *        that it only uses parallel classification. </span>
<a name="l00988"></a>00988 <span class="comment"> *  </span>
<a name="l00989"></a>00989 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l00990"></a>00990 <span class="comment"> * @param classification_table_name The name of the table/view that keeps the data </span>
<a name="l00991"></a>00991 <span class="comment"> *                                  to be classified. It can&#39;t be NULL and must exist.</span>
<a name="l00992"></a>00992 <span class="comment"> * @param result_table_name         The name of the result table. It can&#39;t be NULL and must not already exist.</span>
<a name="l00993"></a>00993 <span class="comment"> * @param verbosity                 &gt; 0 means this function runs in verbose mode. </span>
<a name="l00994"></a>00994 <span class="comment"> *                                  It can&#39;t be NULL. </span>
<a name="l00995"></a>00995 <span class="comment"> *</span>
<a name="l00996"></a>00996 <span class="comment"> * @return A rf_classify_result object.</span>
<a name="l00997"></a>00997 <span class="comment"> *</span>
<a name="l00998"></a>00998 <span class="comment"> */</span>
<a name="l00999"></a>00999 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#a116584e8a5194a4277e964900ee5383d" title="Classify dataset using a trained RF.">rf_classify</a>
<a name="l01000"></a>01000     (
<a name="l01001"></a>01001     rf_table_name               TEXT, 
<a name="l01002"></a><a class="code" href="rf_8sql__in.html#a19941b576b2d8ec04d3ffa7e766ba5a3">01002</a>     classification_table_name   TEXT, 
<a name="l01003"></a>01003     result_table_name           TEXT,
<a name="l01004"></a>01004     verbosity                   INT
<a name="l01005"></a>01005     ) 
<a name="l01006"></a>01006 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
<a name="l01007"></a>01007 DECLARE
<a name="l01008"></a>01008     ret MADLIB_SCHEMA.rf_classify_result;
<a name="l01009"></a>01009 BEGIN
<a name="l01010"></a>01010     ret = MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#a116584e8a5194a4277e964900ee5383d" title="Classify dataset using a trained RF.">rf_classify</a>
<a name="l01011"></a>01011               (
<a name="l01012"></a>01012               rf_table_name,
<a name="l01013"></a>01013               classification_table_name, 
<a name="l01014"></a>01014               result_table_name,
<a name="l01015"></a>01015               &#39;f&#39;,
<a name="l01016"></a>01016               verbosity
<a name="l01017"></a>01017               );
<a name="l01018"></a>01018            
<a name="l01019"></a>01019     RETURN ret;
<a name="l01020"></a>01020 END $$ LANGUAGE PLPGSQL;
<a name="l01021"></a>01021 
<a name="l01022"></a>01022 <span class="comment"></span>
<a name="l01023"></a>01023 <span class="comment">/**</span>
<a name="l01024"></a>01024 <span class="comment"> * @brief Classify a dataset using a trained RF. This function does </span>
<a name="l01025"></a>01025 <span class="comment"> *        the same thing as the full version defined above, except </span>
<a name="l01026"></a>01026 <span class="comment"> *        that it only uses parallel classification and runs in </span>
<a name="l01027"></a>01027 <span class="comment"> *        quiet mode. </span>
<a name="l01028"></a>01028 <span class="comment"> *  </span>
<a name="l01029"></a>01029 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l01030"></a>01030 <span class="comment"> * @param classification_table_name The name of the table/view that keeps the data </span>
<a name="l01031"></a>01031 <span class="comment"> *                                  to be classified. It can&#39;t be NULL and must exist.</span>
<a name="l01032"></a>01032 <span class="comment"> * @param result_table_name         The name of the result table. It can&#39;t be NULL and must not already exist.</span>
<a name="l01033"></a>01033 <span class="comment"> *</span>
<a name="l01034"></a>01034 <span class="comment"> * @return A rf_classify_result object.</span>
<a name="l01035"></a>01035 <span class="comment"> *</span>
<a name="l01036"></a>01036 <span class="comment"> */</span>
<a name="l01037"></a>01037 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#a116584e8a5194a4277e964900ee5383d" title="Classify dataset using a trained RF.">rf_classify</a>
<a name="l01038"></a>01038     (
<a name="l01039"></a>01039     rf_table_name               TEXT, 
<a name="l01040"></a><a class="code" href="rf_8sql__in.html#a57cd1d51be539e0da4fff351f8b477fe">01040</a>     classification_table_name   TEXT, 
<a name="l01041"></a>01041     result_table_name           TEXT
<a name="l01042"></a>01042     ) 
<a name="l01043"></a>01043 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
<a name="l01044"></a>01044 DECLARE
<a name="l01045"></a>01045     ret MADLIB_SCHEMA.rf_classify_result;
<a name="l01046"></a>01046 BEGIN
<a name="l01047"></a>01047     ret = MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#a116584e8a5194a4277e964900ee5383d" title="Classify dataset using a trained RF.">rf_classify</a>
<a name="l01048"></a>01048               (
<a name="l01049"></a>01049               rf_table_name,
<a name="l01050"></a>01050               classification_table_name, 
<a name="l01051"></a>01051               result_table_name,
<a name="l01052"></a>01052               &#39;f&#39;,
<a name="l01053"></a>01053               0
<a name="l01054"></a>01054               );
<a name="l01055"></a>01055            
<a name="l01056"></a>01056     RETURN ret;
<a name="l01057"></a>01057 END $$ LANGUAGE PLPGSQL;
<a name="l01058"></a>01058 
<a name="l01059"></a>01059 <span class="comment"></span>
<a name="l01060"></a>01060 <span class="comment">/**</span>
<a name="l01061"></a>01061 <span class="comment"> * @brief Check the accuracy of a trained RF with a scoring set.</span>
<a name="l01062"></a>01062 <span class="comment"> * </span>
<a name="l01063"></a>01063 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l01064"></a>01064 <span class="comment"> * @param scoring_table_name        The name of the table/view that keeps the data </span>
<a name="l01065"></a>01065 <span class="comment"> *                                  to be scored. It can&#39;t be NULL and must exist.</span>
<a name="l01066"></a>01066 <span class="comment"> * @param verbosity                 &gt; 0 means this function runs in verbose mode. </span>
<a name="l01067"></a>01067 <span class="comment"> *                                  It can&#39;t be NULL. </span>
<a name="l01068"></a>01068 <span class="comment"> *</span>
<a name="l01069"></a>01069 <span class="comment"> * @return The estimated accuracy information.</span>
<a name="l01070"></a>01070 <span class="comment"> *</span>
<a name="l01071"></a>01071 <span class="comment"> */</span>
<a name="l01072"></a>01072 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#ac6745c3d4cae3443b217f3ba66d11ac4" title="Check the accuracy of a trained RF with a scoring set.">rf_score</a>
<a name="l01073"></a>01073     (
<a name="l01074"></a>01074     rf_table_name               TEXT, 
<a name="l01075"></a><a class="code" href="rf_8sql__in.html#ac6745c3d4cae3443b217f3ba66d11ac4">01075</a>     scoring_table_name          TEXT, 
<a name="l01076"></a>01076     verbosity                   INT
<a name="l01077"></a>01077     ) 
<a name="l01078"></a>01078 RETURNS FLOAT8 AS $$
<a name="l01079"></a>01079 BEGIN
<a name="l01080"></a>01080     RETURN  MADLIB_SCHEMA.__treemodel_score
<a name="l01081"></a>01081                    (
<a name="l01082"></a>01082                    rf_table_name,
<a name="l01083"></a>01083                    scoring_table_name,
<a name="l01084"></a>01084                    verbosity
<a name="l01085"></a>01085                    );
<a name="l01086"></a>01086 END;
<a name="l01087"></a>01087 $$ LANGUAGE PLPGSQL;
<a name="l01088"></a>01088 
<a name="l01089"></a>01089 <span class="comment"></span>
<a name="l01090"></a>01090 <span class="comment">/**</span>
<a name="l01091"></a>01091 <span class="comment"> * @brief Check the accuracy of a trained RF with a scoring set in quiet mode.</span>
<a name="l01092"></a>01092 <span class="comment"> * </span>
<a name="l01093"></a>01093 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l01094"></a>01094 <span class="comment"> * @param scoring_table_name        The name of the table/view that keeps the data </span>
<a name="l01095"></a>01095 <span class="comment"> *                                  to be scored. It can&#39;t be NULL and must exist.</span>
<a name="l01096"></a>01096 <span class="comment"> *</span>
<a name="l01097"></a>01097 <span class="comment"> * @return The estimated accuracy information.</span>
<a name="l01098"></a>01098 <span class="comment"> *</span>
<a name="l01099"></a>01099 <span class="comment"> */</span>
<a name="l01100"></a>01100 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#ac6745c3d4cae3443b217f3ba66d11ac4" title="Check the accuracy of a trained RF with a scoring set.">rf_score</a>
<a name="l01101"></a>01101     (
<a name="l01102"></a>01102     rf_table_name               TEXT, 
<a name="l01103"></a><a class="code" href="rf_8sql__in.html#a9fd5da138e06924e89541ce4035ce8e1">01103</a>     scoring_table_name          TEXT
<a name="l01104"></a>01104     ) 
<a name="l01105"></a>01105 RETURNS FLOAT8 AS $$
<a name="l01106"></a>01106 BEGIN
<a name="l01107"></a>01107     RETURN MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#ac6745c3d4cae3443b217f3ba66d11ac4" title="Check the accuracy of a trained RF with a scoring set.">rf_score</a>(rf_table_name, scoring_table_name, 0);
<a name="l01108"></a>01108 END;
<a name="l01109"></a>01109 $$ LANGUAGE PLPGSQL;
<a name="l01110"></a>01110 
<a name="l01111"></a>01111 <span class="comment"></span>
<a name="l01112"></a>01112 <span class="comment">/**</span>
<a name="l01113"></a>01113 <span class="comment"> * @brief Clean up the trained random forest table and any relevant tables.</span>
<a name="l01114"></a>01114 <span class="comment"> * </span>
<a name="l01115"></a>01115 <span class="comment"> * @param rf_table_name             The name of RF table. It can&#39;t be NULL.</span>
<a name="l01116"></a>01116 <span class="comment"> * </span>
<a name="l01117"></a>01117 <span class="comment"> * @return The status of that cleanup operation.</span>
<a name="l01118"></a>01118 <span class="comment"> *</span>
<a name="l01119"></a>01119 <span class="comment"> */</span>
<a name="l01120"></a>01120 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.<a class="code" href="rf_8sql__in.html#af33b77b75df225ee65a8acf18705256e" title="Cleanup the trained random forest table and any relevant tables.">rf_clean</a>
<a name="l01121"></a>01121     ( 
<a name="l01122"></a>01122     rf_table_name TEXT
<a name="l01123"></a><a class="code" href="rf_8sql__in.html#af33b77b75df225ee65a8acf18705256e">01123</a>     ) 
<a name="l01124"></a>01124 RETURNS BOOLEAN AS $$
<a name="l01125"></a>01125 DECLARE
<a name="l01126"></a>01126     result BOOLEAN;
<a name="l01127"></a>01127 BEGIN
<a name="l01128"></a>01128     result = MADLIB_SCHEMA.__treemodel_clean(rf_table_name);
<a name="l01129"></a>01129     RETURN result;
<a name="l01130"></a>01130 END
<a name="l01131"></a>01131 $$ LANGUAGE PLPGSQL;
</pre></div></div>
</div>
  <div id="nav-path" class="navpath">
    <ul>
      <li class="navelem"><a class="el" href="rf_8sql__in.html">rf.sql_in</a>      </li>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
     onmouseover="return searchBox.OnSearchSelectShow()"
     onmouseout="return searchBox.OnSearchSelectHide()"
     onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>

<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0" 
        name="MSearchResults" id="MSearchResults">
</iframe>
</div>


    <li class="footer">Generated on Tue Apr 2 2013 14:57:03 for MADlib by
    <a href="http://www.doxygen.org/index.html">
    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
   </ul>
 </div>


</body>
</html>
