docs/v0.7/dt_8sql__in_source.html - madlib-site - Git at Google

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <!-- "text/xhtml" is not a registered media type; declare text/html with the UTF-8 charset. -->
 <meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
 <title>MADlib: dt.sql_in Source File</title>

 <!-- Doxygen style sheets: tab bar, general page layout, navigation tree. -->
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <link href="doxygen.css" rel="stylesheet" type="text/css" />
 <link href="navtree.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="resize.js"></script>
 <script type="text/javascript" src="navtree.js"></script>
 <!-- initResizable is not defined in this page; presumably it comes from resize.js. -->
 <script type="text/javascript">
   $(document).ready(initResizable);
 </script>
 <link href="search/search.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="search/search.js"></script>
 <!-- searchBox is created by an inline script in the body; the ready callback runs once the document is parsed. -->
 <script type="text/javascript">
   $(document).ready(function() { searchBox.OnSelectItem(0); });
 </script>
 <!-- MathJax loader with its configuration kept inline, as generated.
      The type attribute is required by XHTML 1.0, and the object literal must
      not end with a trailing comma because older script engines reject it. -->
 <script type="text/javascript" src="../mathjax/MathJax.js">
   MathJax.Hub.Config({
     extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
     jax: ["input/TeX","output/HTML-CSS"]
   });
 </script>
 </head>
 <body>
 <div id="top"><!-- do not remove this div! -->


 <!-- Title area: project name, version number and project brief. -->
 <div id="titlearea">
 <table cellspacing="0" cellpadding="0">
  <tbody>
  <tr style="height: 56px;">


   <td style="padding-left: 0.5em;">
    <div id="projectname">MADlib
    <!-- Version number followed by a link to this same page under ../latest/. -->
    &#160;<span id="projectnumber">0.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./dt_8sql__in_source.html"> A newer version is available</a></span>
    </div>
    <div id="projectbrief">User Documentation</div>
   </td>


  </tr>
  </tbody>
 </table>
 </div>

 <!-- Generated by Doxygen 1.7.5.1 -->
 <!-- Creates the global searchBox object used by the inline event handlers below.
      SearchBox is not defined in this page; presumably it comes from search/search.js. -->
 <script type="text/javascript">
 var searchBox = new SearchBox("searchBox", "search",false,'Search');
 </script>
 <script type="text/javascript" src="dynsections.js"></script>
   <!-- Primary tab row; "Files" is marked as the current tab. -->
   <div id="navrow1" class="tabs">
     <ul class="tablist">
       <li><a href="index.html"><span>Main&#160;Page</span></a></li>
       <li><a href="modules.html"><span>Modules</span></a></li>
       <li class="current"><a href="files.html"><span>Files</span></a></li>
       <li>
         <!-- Search box: every handler delegates to a method of the searchBox object. -->
         <div id="MSearchBox" class="MSearchBoxInactive">
         <span class="left">
           <img id="MSearchSelect" src="search/mag_sel.png"
                onmouseover="return searchBox.OnSearchSelectShow()"
                onmouseout="return searchBox.OnSearchSelectHide()"
                alt=""/>
           <input type="text" id="MSearchField" value="Search" accesskey="S"
                onfocus="searchBox.OnSearchFieldFocus(true)"
                onblur="searchBox.OnSearchFieldFocus(false)"
                onkeyup="searchBox.OnSearchFieldChange(event)"/>
           </span><span class="right">
             <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
           </span>
         </div>
       </li>
     </ul>
   </div>
   <!-- Secondary tab row: sub-pages of the Files section. -->
   <div id="navrow2" class="tabs2">
     <ul class="tablist">
       <li><a href="files.html"><span>File&#160;List</span></a></li>
       <li><a href="globals.html"><span>File&#160;Members</span></a></li>
     </ul>
   </div>
 </div>
 <!-- Side navigation panel. The tree container is empty in the markup;
      presumably navtree.js fills it in when initNavTree runs (confirm against navtree.js). -->
 <div id="side-nav" class="ui-resizable side-nav-resizable">
   <div id="nav-tree">
     <div id="nav-tree-contents">
     </div>
   </div>
   <!-- Handle between the navigation panel and the content; text selection is disabled on it for Mozilla browsers. -->
   <div id="splitbar" style="-moz-user-select:none;"
        class="ui-resizable-handle">
   </div>
 </div>
 <!-- Start the navigation tree with this file's documentation page as the first argument. -->
 <script type="text/javascript">
   initNavTree('dt_8sql__in.html','');
 </script>
 <div id="doc-content">
 <div class="header">
   <div class="headertitle">
 <div class="title">dt.sql_in</div>  </div>
 </div>
 <div class="contents">
 <a href="dt_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/**</span>
 <a name="l00002"></a>00002 <span class="comment"> *</span>
 <a name="l00003"></a>00003 <span class="comment"> * @file dt.sql_in</span>
 <a name="l00004"></a>00004 <span class="comment"> *</span>
 <a name="l00005"></a>00005 <span class="comment"> * @brief the common functions written in PL/PGSQL shared by C4.5 and RF</span>
 <a name="l00006"></a>00006 <span class="comment"> * @date April 5, 2012</span>
 <a name="l00007"></a>00007 <span class="comment"> *</span>
 <a name="l00008"></a>00008 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
 <a name="l00009"></a>00009
 <a name="l00010"></a>00010 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
 <a name="l00011"></a>00011 <span class="stringliteral"></span>
 <a name="l00012"></a>00012 <span class="stringliteral">/* Own macro definitions */</span>
 <a name="l00013"></a>00013 <span class="stringliteral">m4_ifelse(</span>
 <a name="l00014"></a>00014 <span class="stringliteral">    m4_eval(</span>
 <a name="l00015"></a>00015 <span class="stringliteral">        m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
 <a name="l00016"></a>00016         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ &lt; 401
 <a name="l00017"></a>00017     ), 1,
 <a name="l00018"></a>00018     `m4_define(`__GREENPLUM_PRE_4_1__<span class="charliteral">&#39;)&#39;</span>
 <a name="l00019"></a>00019 )
 <a name="l00020"></a>00020 m4_ifelse(
 <a name="l00021"></a>00021     m4_eval(
 <a name="l00022"></a>00022         m4_ifdef(`__POSTGRESQL__<span class="stringliteral">&#39;, 1, 0) &amp;&amp;</span>
 <a name="l00023"></a>00023 <span class="stringliteral">        __DBMS_VERSION_MAJOR__ &lt; 9</span>
 <a name="l00024"></a>00024 <span class="stringliteral">    ), 1,</span>
 <a name="l00025"></a>00025 <span class="stringliteral">    `m4_define(`__POSTGRESQL_PRE_9_0__&#39;</span>)<span class="stringliteral">&#39;</span>
 <a name="l00026"></a>00026 <span class="stringliteral">)</span>
 <a name="l00027"></a>00027 <span class="stringliteral"></span>
 <a name="l00028"></a>00028 <span class="stringliteral">m4_ifelse(</span>
 <a name="l00029"></a>00029 <span class="stringliteral">    m4_eval(</span>
 <a name="l00030"></a>00030 <span class="stringliteral">        m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
 <a name="l00031"></a>00031         __DBMS_VERSION_MAJOR__ * 10000 +
 <a name="l00032"></a>00032             __DBMS_VERSION_MINOR__ * 100 +
 <a name="l00033"></a>00033             __DBMS_VERSION_PATCH__ &gt;= 40201
 <a name="l00034"></a>00034     ), 1,
 <a name="l00035"></a>00035     `m4_define(`__GREENPLUM_GE_4_2_1__<span class="charliteral">&#39;)&#39;</span>
 <a name="l00036"></a>00036 )
 <a name="l00037"></a>00037
 <a name="l00038"></a>00038 <span class="comment">/*</span>
 <a name="l00039"></a>00039 <span class="comment"> * This is a global table to store information for various tree training.</span>
 <a name="l00040"></a>00040 <span class="comment"> *</span>
 <a name="l00041"></a>00041 <span class="comment"> *   classifier_name             The name of the classifier, e.g, &#39;C4.5&#39; or &#39;RF&#39;.</span>
 <a name="l00042"></a>00042 <span class="comment"> *   result_table_oid            The OID of the result table.</span>
 <a name="l00043"></a>00043 <span class="comment"> *   training_table_oid          The OID of the training table.</span>
 <a name="l00044"></a>00044 <span class="comment"> *   training_metatable_oid      The OID of the metadata table.</span>
 <a name="l00045"></a>00045 <span class="comment"> *   training_encoded_table_oid  The OID of the encoded table.</span>
 <a name="l00046"></a>00046 <span class="comment"> *   validation_table_oid        The OID of the validation table.</span>
 <a name="l00047"></a>00047 <span class="comment"> *   how2handle_missing_value    The approach name to handle missing value.</span>
 <a name="l00048"></a>00048 <span class="comment"> *   split_criterion             The name of the split criterion for this training.</span>
 <a name="l00049"></a>00049 <span class="comment"> *   sampling_percentage         The sampling percentage for training each tree.</span>
 <a name="l00050"></a>00050 <span class="comment"> *   num_feature_chosen          The number of features that will be chosen to find the best split.</span>
 <a name="l00051"></a>00051 <span class="comment"> *   num_trees                   The number of trees that will be grown in training.</span>
 <a name="l00052"></a>00052 <span class="comment"> *</span>
 <a name="l00053"></a>00053 <span class="comment"> */</span>
 <a name="l00054"></a>00054 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
 <a name="l00055"></a>00055 CREATE TABLE MADLIB_SCHEMA.training_info
 <a name="l00056"></a>00056     (
 <a name="l00057"></a>00057     classifier_name             TEXT NOT NULL,
 <a name="l00058"></a>00058     result_table_oid            OID NOT NULL,
 <a name="l00059"></a>00059     training_table_oid          OID,
 <a name="l00060"></a>00060     training_metatable_oid      OID,
 <a name="l00061"></a>00061     training_encoded_table_oid  OID,
 <a name="l00062"></a>00062     validation_table_oid        OID,
 <a name="l00063"></a>00063     how2handle_missing_value    TEXT,
 <a name="l00064"></a>00064     split_criterion             TEXT,
 <a name="l00065"></a>00065     sampling_percentage         FLOAT,
 <a name="l00066"></a>00066     num_feature_chosen          INT,
 <a name="l00067"></a>00067     num_trees                   INT,
 <a name="l00068"></a>00068     PRIMARY KEY (result_table_oid)
 <a name="l00069"></a>00069     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (result_table_oid)&#39;);
 <a name="l00070"></a>00070 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
 <a name="l00071"></a>00071
 <a name="l00072"></a>00072
 <a name="l00073"></a>00073 <span class="comment">/*</span>
 <a name="l00074"></a>00074 <span class="comment"> * @brief Remove the trained tree from training info table. </span>
 <a name="l00075"></a>00075 <span class="comment"> *</span>
 <a name="l00076"></a>00076 <span class="comment"> * @param tree_table    The full name of the tree table.</span>
 <a name="l00077"></a>00077 <span class="comment"> *</span>
 <a name="l00078"></a>00078 <span class="comment"> */</span>
 <a name="l00079"></a>00079 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
 <a name="l00080"></a>00080     (
 <a name="l00081"></a>00081     tree_table TEXT
 <a name="l00082"></a>00082     )
 <a name="l00083"></a>00083 RETURNS <span class="keywordtype">void</span> AS $$
 <a name="l00084"></a>00084 BEGIN
 <a name="l00085"></a>00085     DELETE FROM MADLIB_SCHEMA.training_info
 <a name="l00086"></a>00086     WHERE result_table_oid = tree_table::regclass;
 <a name="l00087"></a>00087 end
 <a name="l00088"></a>00088 $$ LANGUAGE PLPGSQL;
 <a name="l00089"></a>00089
 <a name="l00090"></a>00090
 <a name="l00091"></a>00091 <span class="comment">/*</span>
 <a name="l00092"></a>00092 <span class="comment"> * @brief Insert the trained tree into training info table. </span>
 <a name="l00093"></a>00093 <span class="comment"> *</span>
 <a name="l00094"></a>00094 <span class="comment"> * @param classifier_table_name         The name of the classifier.</span>
 <a name="l00095"></a>00095 <span class="comment"> * @param result_table_name             The full name of the training result table.</span>
 <a name="l00096"></a>00096 <span class="comment"> * @param training_table_name           The full name of the training table.</span>
 <a name="l00097"></a>00097 <span class="comment"> * @param training_metatable_name       The full name of metatable.</span>
 <a name="l00098"></a>00098 <span class="comment"> * @param training_encoded_table_name   The full name of the encoded table. </span>
 <a name="l00099"></a>00099 <span class="comment"> * @param validation_table_name         The full name of the validation table.</span>
 <a name="l00100"></a>00100 <span class="comment"> * @param how2handle_missing_value      The name of the routine to process unknown </span>
 <a name="l00101"></a>00101 <span class="comment"> *                                      values.</span>
 <a name="l00102"></a>00102 <span class="comment"> * @param split_criterion               The name of split criterion.</span>
 <a name="l00103"></a>00103 <span class="comment"> * @param sampling_percentage           The percentage of bootstrap samples size in </span>
 <a name="l00104"></a>00104 <span class="comment"> *                                      training dataset.</span>
 <a name="l00105"></a>00105 <span class="comment"> * @param num_features_chosen           The number of features to split on each tree</span>
 <a name="l00106"></a>00106 <span class="comment"> *                                      node. </span>
 <a name="l00107"></a>00107 <span class="comment"> * @param num_trees                     The number of trees after completing the</span>
 <a name="l00108"></a>00108 <span class="comment"> *                                      training process.</span>
 <a name="l00109"></a>00109 <span class="comment"> * </span>
 <a name="l00110"></a>00110 <span class="comment"> */</span>
 <a name="l00111"></a>00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
 <a name="l00112"></a>00112     (
 <a name="l00113"></a>00113     classifier_table_name       TEXT,
 <a name="l00114"></a>00114     result_table_name           TEXT,
 <a name="l00115"></a>00115     training_table_name         TEXT,
 <a name="l00116"></a>00116     training_metatable_name     TEXT,
 <a name="l00117"></a>00117     training_encoded_table_name TEXT,
 <a name="l00118"></a>00118     validation_table_name       TEXT,
 <a name="l00119"></a>00119     how2handle_missing_value    TEXT,
 <a name="l00120"></a>00120     split_criterion             TEXT,
 <a name="l00121"></a>00121     sampling_percentage         FLOAT,
 <a name="l00122"></a>00122     num_features_chosen         INT,
 <a name="l00123"></a>00123     num_trees                   INT
 <a name="l00124"></a>00124     )
 <a name="l00125"></a>00125 RETURNS <span class="keywordtype">void</span> AS $$
 <a name="l00126"></a>00126 BEGIN
 <a name="l00127"></a>00127     INSERT INTO MADLIB_SCHEMA.training_info VALUES
 <a name="l00128"></a>00128         (
 <a name="l00129"></a>00129             classifier_table_name,
 <a name="l00130"></a>00130             result_table_name::regclass,
 <a name="l00131"></a>00131             training_table_name::regclass,
 <a name="l00132"></a>00132             training_metatable_name::regclass,
 <a name="l00133"></a>00133             training_encoded_table_name::regclass,
 <a name="l00134"></a>00134             validation_table_name::regclass,
 <a name="l00135"></a>00135             how2handle_missing_value,
 <a name="l00136"></a>00136             split_criterion,
 <a name="l00137"></a>00137             sampling_percentage,
 <a name="l00138"></a>00138             num_features_chosen,
 <a name="l00139"></a>00139             num_trees
 <a name="l00140"></a>00140         );
 <a name="l00141"></a>00141 END
 <a name="l00142"></a>00142 $$ LANGUAGE PLPGSQL;
 <a name="l00143"></a>00143
 <a name="l00144"></a>00144
 <a name="l00145"></a>00145 <span class="comment">/*</span>
 <a name="l00146"></a>00146 <span class="comment"> * @brief Get the name of the encoded table.  </span>
 <a name="l00147"></a>00147 <span class="comment"> *</span>
 <a name="l00148"></a>00148 <span class="comment"> * @param tree_table    The full name of the tree table.</span>
 <a name="l00149"></a>00149 <span class="comment"> *</span>
 <a name="l00150"></a>00150 <span class="comment"> * @return The full name of the encoded table.</span>
 <a name="l00151"></a>00151 <span class="comment"> *</span>
 <a name="l00152"></a>00152 <span class="comment"> */</span>
 <a name="l00153"></a>00153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
 <a name="l00154"></a>00154     (
 <a name="l00155"></a>00155     tree_table TEXT
 <a name="l00156"></a>00156     )
 <a name="l00157"></a>00157 RETURNS TEXT AS $$
 <a name="l00158"></a>00158 DECLARE
 <a name="l00159"></a>00159     encoded_table_name TEXT := &#39;&#39;;
 <a name="l00160"></a>00160 BEGIN
 <a name="l00161"></a>00161     SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
 <a name="l00162"></a>00162     FROM MADLIB_SCHEMA.training_info
 <a name="l00163"></a>00163     WHERE result_table_oid = tree_table::regclass
 <a name="l00164"></a>00164     INTO encoded_table_name;
 <a name="l00165"></a>00165
 <a name="l00166"></a>00166     RETURN encoded_table_name;
 <a name="l00167"></a>00167 END
 <a name="l00168"></a>00168 $$ LANGUAGE PLPGSQL STABLE;
 <a name="l00169"></a>00169
 <a name="l00170"></a>00170
 <a name="l00171"></a>00171 <span class="comment">/*</span>
 <a name="l00172"></a>00172 <span class="comment"> * @brief Test if the given table is a valid encoded one. </span>
 <a name="l00173"></a>00173 <span class="comment"> *        A valid encoded table has the following characteristic:</span>
 <a name="l00174"></a>00174 <span class="comment"> *            + Its OID is in the column &quot;training_encoded_table_oid&quot;</span>
 <a name="l00175"></a>00175 <span class="comment"> *              of training_info table.</span>
 <a name="l00176"></a>00176 <span class="comment"> *            + It has 5 columns, whose names are id, fid, fval,</span>
 <a name="l00177"></a>00177 <span class="comment"> *              is_cont and class.</span>
 <a name="l00178"></a>00178 <span class="comment"> *            + The types of the 5 columns are BIGINT, INT, FLOAT8,</span>
 <a name="l00179"></a>00179 <span class="comment"> *              BOOL and INT, respectively.</span>
 <a name="l00180"></a>00180 <span class="comment"> *</span>
 <a name="l00181"></a>00181 <span class="comment"> * @param enc_tbl_name    The full name of the encoded table.</span>
 <a name="l00182"></a>00182 <span class="comment"> *</span>
 <a name="l00183"></a>00183 <span class="comment"> * @return True if the given table is a valid encoded one.</span>
 <a name="l00184"></a>00184 <span class="comment"> *         False if it&#39;s an invalid encoded table.</span>
 <a name="l00185"></a>00185 <span class="comment"> *</span>
 <a name="l00186"></a>00186 <span class="comment"> */</span>
 <a name="l00187"></a>00187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
 <a name="l00188"></a>00188     (
 <a name="l00189"></a>00189     enc_tbl_name TEXT
 <a name="l00190"></a>00190     )
 <a name="l00191"></a>00191 RETURNS BOOL AS $$
 <a name="l00192"></a>00192 DECLARE
 <a name="l00193"></a>00193     num_enc_table  INT;
 <a name="l00194"></a>00194     num_cols       INT;
 <a name="l00195"></a>00195 BEGIN
 <a name="l00196"></a>00196     -- test if the table is registered as an encoded table in training_info
 <a name="l00197"></a>00197     SELECT count(*)
 <a name="l00198"></a>00198     FROM MADLIB_SCHEMA.training_info
 <a name="l00199"></a>00199     WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
 <a name="l00200"></a>00200             enc_tbl_name
 <a name="l00201"></a>00201     INTO num_enc_table;
 <a name="l00202"></a>00202
 <a name="l00203"></a>00203     -- count the live columns whose name AND type match the expected layout;
 <a name="l00204"></a>00204     -- each name is paired with its own type instead of testing two loose sets
 <a name="l00205"></a>00205     SELECT count(*)
 <a name="l00206"></a>00206     FROM pg_attribute
 <a name="l00207"></a>00207     WHERE attrelid = enc_tbl_name::regclass::oid AND
 <a name="l00208"></a>00208           attnum &gt; 0 AND
 <a name="l00209"></a>00209           NOT attisdropped AND
 <a name="l00210"></a>00210           (
 <a name="l00211"></a>00211            (attname = &#39;<span class="keywordtype">id</span>&#39;      AND atttypid = &#39;int8&#39;::regtype)   OR
 <a name="l00212"></a>00212            (attname = &#39;fid&#39;     AND atttypid = &#39;<span class="keywordtype">int</span>&#39;::regtype)    OR
 <a name="l00213"></a>00213            (attname = &#39;fval&#39;    AND atttypid = &#39;float8&#39;::regtype) OR
 <a name="l00214"></a>00214            (attname = &#39;is_cont&#39; AND atttypid = &#39;<span class="keywordtype">bool</span>&#39;::regtype)   OR
 <a name="l00215"></a>00215            (attname = &#39;class&#39;   AND atttypid = &#39;<span class="keywordtype">int</span>&#39;::regtype)
 <a name="l00216"></a>00216           )
 <a name="l00217"></a>00217     INTO num_cols;
 <a name="l00218"></a>00218
 <a name="l00219"></a>00219     RETURN (num_enc_table &gt; 0) AND (num_cols = 5);
 <a name="l00220"></a>00220 END
 <a name="l00221"></a>00221 $$ LANGUAGE PLPGSQL;
 <a name="l00222"></a>00222
 <a name="l00223"></a>00223
 <a name="l00224"></a>00224 <span class="comment">/*</span>
 <a name="l00225"></a>00225 <span class="comment"> * @brief Get the meta table name by the tree table name. </span>
 <a name="l00226"></a>00226 <span class="comment"> *</span>
 <a name="l00227"></a>00227 <span class="comment"> * @param tree_table    The full name of the tree table.</span>
 <a name="l00228"></a>00228 <span class="comment"> * </span>
 <a name="l00229"></a>00229 <span class="comment"> * @return The full name of the metatable.</span>
 <a name="l00230"></a>00230 <span class="comment"> *</span>
 <a name="l00231"></a>00231 <span class="comment"> */</span>
 <a name="l00232"></a>00232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
 <a name="l00233"></a>00233     (
 <a name="l00234"></a>00234     tree_table TEXT
 <a name="l00235"></a>00235     )
 <a name="l00236"></a>00236 RETURNS TEXT AS $$
 <a name="l00237"></a>00237 DECLARE
 <a name="l00238"></a>00238     metatable_name TEXT := &#39;&#39;;
 <a name="l00239"></a>00239 BEGIN
 <a name="l00240"></a>00240
 <a name="l00241"></a>00241     PERFORM MADLIB_SCHEMA.__assert_table
 <a name="l00242"></a>00242             (
 <a name="l00243"></a>00243                 tree_table::TEXT,
 <a name="l00244"></a>00244                 &#39;t&#39;::BOOL
 <a name="l00245"></a>00245             );
 <a name="l00246"></a>00246
 <a name="l00247"></a>00247     PERFORM MADLIB_SCHEMA.__assert_table
 <a name="l00248"></a>00248             (
 <a name="l00249"></a>00249                 &#39;MADLIB_SCHEMA.training_info&#39;::TEXT,
 <a name="l00250"></a>00250                 &#39;t&#39;::BOOL
 <a name="l00251"></a>00251             );
 <a name="l00252"></a>00252
 <a name="l00253"></a>00253     SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
 <a name="l00254"></a>00254     FROM MADLIB_SCHEMA.training_info
 <a name="l00255"></a>00255     WHERE result_table_oid = tree_table::regclass
 <a name="l00256"></a>00256     INTO metatable_name;
 <a name="l00257"></a>00257
 <a name="l00258"></a>00258     RETURN metatable_name;
 <a name="l00259"></a>00259 END
 <a name="l00260"></a>00260 $$ LANGUAGE PLPGSQL;
 <a name="l00261"></a>00261
 <a name="l00262"></a>00262
 <a name="l00263"></a>00263 <span class="comment">/*</span>
 <a name="l00264"></a>00264 <span class="comment"> * @brief Get the unknown values processing routine id. </span>
 <a name="l00265"></a>00265 <span class="comment"> *</span>
 <a name="l00266"></a>00266 <span class="comment"> * @param tree_table    The full name of the tree table.</span>
 <a name="l00267"></a>00267 <span class="comment"> *</span>
 <a name="l00268"></a>00268 <span class="comment"> * @return The encoded missing value processing routine id.</span>
 <a name="l00269"></a>00269 <span class="comment"> *</span>
 <a name="l00270"></a>00270 <span class="comment"> */</span>
 <a name="l00271"></a>00271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
 <a name="l00272"></a>00272     (
 <a name="l00273"></a>00273     tree_table TEXT
 <a name="l00274"></a>00274     )
 <a name="l00275"></a>00275 RETURNS INT AS $$
 <a name="l00276"></a>00276 DECLARE
 <a name="l00277"></a>00277     name TEXT;
 <a name="l00278"></a>00278 BEGIN
 <a name="l00279"></a>00279     name = MADLIB_SCHEMA.__get_routine_name(tree_table);
 <a name="l00280"></a>00280
 <a name="l00281"></a>00281     IF (name = &#39;ignore&#39;) THEN
 <a name="l00282"></a>00282         RETURN 1;
 <a name="l00283"></a>00283     ELSIF (name = &#39;explicit&#39;) THEN
 <a name="l00284"></a>00284         RETURN 2;
 <a name="l00285"></a>00285     ELSE
 <a name="l00286"></a>00286         RAISE EXCEPTION &#39;__get_routine_id: %&#39;, name;
 <a name="l00287"></a>00287     END IF;
 <a name="l00288"></a>00288
 <a name="l00289"></a>00289 END
 <a name="l00290"></a>00290 $$ LANGUAGE PLPGSQL;
 <a name="l00291"></a>00291
 <a name="l00292"></a>00292
 <a name="l00293"></a>00293 <span class="comment">/*</span>
 <a name="l00294"></a>00294 <span class="comment"> * @brief Get the unknown values processing routine name. </span>
 <a name="l00295"></a>00295 <span class="comment"> *        The valid routine name is &#39;ignore&#39; or &#39;explicit&#39;.</span>
 <a name="l00296"></a>00296 <span class="comment"> *</span>
 <a name="l00297"></a>00297 <span class="comment"> * @param tree_table    The full name of the tree table.</span>
 <a name="l00298"></a>00298 <span class="comment"> *</span>
 <a name="l00299"></a>00299 <span class="comment"> * @return The encoded missing value processing routine name.</span>
 <a name="l00300"></a>00300 <span class="comment"> *</span>
 <a name="l00301"></a>00301 <span class="comment"> */</span>
 <a name="l00302"></a>00302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
 <a name="l00303"></a>00303     (
 <a name="l00304"></a>00304     tree_table TEXT
 <a name="l00305"></a>00305     )
 <a name="l00306"></a>00306 RETURNS TEXT AS $$
 <a name="l00307"></a>00307 DECLARE
 <a name="l00308"></a>00308     name       TEXT;
 <a name="l00309"></a>00309 BEGIN
 <a name="l00310"></a>00310     PERFORM MADLIB_SCHEMA.__assert_table
 <a name="l00311"></a>00311         (
 <a name="l00312"></a>00312             &#39;MADLIB_SCHEMA.training_info&#39;,
 <a name="l00313"></a>00313             &#39;t&#39;
 <a name="l00314"></a>00314         );
 <a name="l00315"></a>00315
 <a name="l00316"></a>00316     -- Look the routine name up with a static query: tree_table is passed
 <a name="l00317"></a>00317     -- as a value and cast to regclass, so its text is never spliced into
 <a name="l00318"></a>00318     -- the statement (no SQL built by string substitution and EXECUTE).
 <a name="l00319"></a>00319     -- This is the same form used by __get_encode_table_name and
 <a name="l00320"></a>00320     -- __get_metatable_name.
 <a name="l00321"></a>00321     SELECT how2handle_missing_value
 <a name="l00322"></a>00322     FROM   MADLIB_SCHEMA.training_info
 <a name="l00323"></a>00323     WHERE  result_table_oid = tree_table::regclass
 <a name="l00324"></a>00324     INTO   name;
 <a name="l00325"></a>00325
 <a name="l00326"></a>00326     RETURN name;
 <a name="l00327"></a>00327 END
 <a name="l00328"></a>00328 $$ LANGUAGE PLPGSQL;
 <a name="l00329"></a>00329
 <a name="l00330"></a>00330
 <a name="l00331"></a>00331 <span class="comment">/*</span>
 <a name="l00332"></a>00332 <span class="comment"> * @brief Get the name of the tree table from the encoded table name. </span>
 <a name="l00333"></a>00333 <span class="comment"> *</span>
 <a name="l00334"></a>00334 <span class="comment"> * @param enc_table_name  The encoded table name.  </span>
 <a name="l00335"></a>00335 <span class="comment"> *</span>
 <a name="l00336"></a>00336 <span class="comment"> * @return The full name of the tree table.</span>
 <a name="l00337"></a>00337 <span class="comment"> *</span>
 <a name="l00338"></a>00338 <span class="comment"> */</span>
 <a name="l00339"></a>00339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
 <a name="l00340"></a>00340     (
 <a name="l00341"></a>00341     enc_table_name TEXT
 <a name="l00342"></a>00342     )
 <a name="l00343"></a>00343 RETURNS TEXT AS $$
 <a name="l00344"></a>00344 DECLARE
 <a name="l00345"></a>00345     name       TEXT;
 <a name="l00346"></a>00346 BEGIN
 <a name="l00347"></a>00347     -- Static query: enc_table_name is passed as a value and cast to
 <a name="l00348"></a>00348     -- regclass, so its text is never spliced into the statement (no SQL
 <a name="l00349"></a>00349     -- built by string substitution and EXECUTE). LIMIT 1 is kept as is;
 <a name="l00350"></a>00350     -- presumably several result tables can share one encoded table, in
 <a name="l00351"></a>00351     -- which case an arbitrary one of them is returned.
 <a name="l00352"></a>00352     SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
 <a name="l00353"></a>00353     FROM MADLIB_SCHEMA.training_info
 <a name="l00354"></a>00354     WHERE training_encoded_table_oid = enc_table_name::regclass
 <a name="l00355"></a>00355     LIMIT 1
 <a name="l00356"></a>00356     INTO name;
 <a name="l00357"></a>00357
 <a name="l00358"></a>00358     RETURN name;
 <a name="l00359"></a>00359 END
 <a name="l00360"></a>00360 $$ LANGUAGE PLPGSQL;
 <a name="l00361"></a>00361
 <a name="l00362"></a>00362
 <a name="l00363"></a>00363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
 <a name="l00364"></a>00364     (
 <a name="l00365"></a>00365     result      FLOAT8[],    -- intermediate result
 <a name="l00366"></a>00366     scv         FLOAT8[],
 <a name="l00367"></a>00367     fid         INT,
 <a name="l00368"></a>00368     split_value FLOAT8
 <a name="l00369"></a>00369     )
 <a name="l00370"></a>00370 RETURNS FLOAT8[]
 <a name="l00371"></a>00371 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_best_scv_sfunc&#39;
 <a name="l00372"></a>00372 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00373"></a>00373
 <a name="l00374"></a>00374
 <a name="l00375"></a>00375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
 <a name="l00376"></a>00376     (
 <a name="l00377"></a>00377     sfunc1_result     FLOAT8[],
 <a name="l00378"></a>00378     sfunc2_result     FLOAT8[]
 <a name="l00379"></a>00379     )
 <a name="l00380"></a>00380 RETURNS FLOAT8[]
 <a name="l00381"></a>00381 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_best_scv_prefunc&#39;
 <a name="l00382"></a>00382 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00383"></a>00383
 <a name="l00384"></a>00384
 <a name="l00385"></a>00385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
 <a name="l00386"></a>00386     (
 <a name="l00387"></a>00387     FLOAT8[],       -- scv
 <a name="l00388"></a>00388     INT,            -- fid
 <a name="l00389"></a>00389     FLOAT8          -- split_value
 <a name="l00390"></a>00390     ) CASCADE;
 <a name="l00391"></a>00391 CREATE
 <a name="l00392"></a>00392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
 <a name="l00393"></a>00393     (
 <a name="l00394"></a>00394     FLOAT8[],       -- scv
 <a name="l00395"></a>00395     INT,            -- fid
 <a name="l00396"></a>00396     FLOAT8          -- split_value
 <a name="l00397"></a>00397     )
 <a name="l00398"></a>00398 (
 <a name="l00399"></a>00399   SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
 <a name="l00400"></a>00400   m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,&#39;)
 <a name="l00401"></a>00401   STYPE=FLOAT8[],
 <a name="l00402"></a>00402   initcond = &#39;{0, 0, 0, 0, 0, 0, 0}<span class="stringliteral">&#39;</span>
 <a name="l00403"></a>00403 <span class="stringliteral">);</span>
 <a name="l00404"></a>00404 <span class="stringliteral"></span>
 <a name="l00405"></a>00405 <span class="stringliteral"></span>
 <a name="l00406"></a>00406 <span class="stringliteral">/*</span>
 <a name="l00407"></a>00407 <span class="stringliteral"> * @brief The step function is defined to process each record in the ACS set. </span>
 <a name="l00408"></a>00408 <span class="stringliteral"> *        The records have this format: </span>
 <a name="l00409"></a>00409 <span class="stringliteral"> *        {fid, fval, is_cont, split_value, le, total, tid, nid}</span>
 <a name="l00410"></a>00410 <span class="stringliteral"> *</span>
 <a name="l00411"></a>00411 <span class="stringliteral"> * @param result            The array used to keep the best attribute&#39;</span>s info.
 <a name="l00412"></a>00412  * @param sc_code           The code of the split criterion.
 <a name="l00413"></a>00413  * @param is_cont           True  - The feature is continuous.
 <a name="l00414"></a>00414  *                          False - The feature is discrete.
 <a name="l00415"></a>00415  * @param num_class         The total number of classes.
 <a name="l00416"></a>00416  * @param le_array          The le component of the ACS record. le_array[i] is the
 <a name="l00417"></a>00417  *                          number of samples whose <span class="keyword">class </span>code equals to i and
 <a name="l00418"></a>00418  *                          whose fval is less-than or equal to the fval component
 <a name="l00419"></a>00419  *                          of the ACS record being processed.
 <a name="l00420"></a>00420  * @param total_array       The total component of the ACS record. total_array[i] is
 <a name="l00421"></a>00421  *                          the number of samples whose <span class="keyword">class </span>code equals to i.
 <a name="l00422"></a>00422  * @param true_total        The real total number of samples currently assigned to
 <a name="l00423"></a>00423  *                          the node identified by (tid, nid). If there are missing
 <a name="l00424"></a>00424  *                          values in fval, the sum of all elements in total_array
 <a name="l00425"></a>00425  *                          will be less than true_total.
 <a name="l00426"></a>00426  *
 <a name="l00427"></a>00427  * @<span class="keywordflow">return</span> A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
 <a name="l00428"></a>00428  *         in dt.c <span class="keywordflow">for</span> the detailed information of <span class="keyword">this</span> array.
 <a name="l00429"></a>00429  *
 <a name="l00430"></a>00430  */
 <a name="l00431"></a>00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
 <a name="l00432"></a>00432     (
 <a name="l00433"></a>00433     result          FLOAT8[],
 <a name="l00434"></a>00434     sc_code         INT,
 <a name="l00435"></a>00435     is_cont         BOOLEAN,
 <a name="l00436"></a>00436     num_class       INT,
 <a name="l00437"></a>00437     le_array        FLOAT8[],
 <a name="l00438"></a>00438     total_array     FLOAT8[],
 <a name="l00439"></a>00439     true_total      BIGINT
 <a name="l00440"></a>00440     )
 <a name="l00441"></a>00441 RETURNS FLOAT8[]
 <a name="l00442"></a>00442 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_sfunc&#39;</span>
 <a name="l00443"></a>00443 LANGUAGE C IMMUTABLE;
 <a name="l00444"></a>00444
 <a name="l00445"></a>00445
 <a name="l00446"></a>00446 <span class="comment">/*</span>
 <a name="l00447"></a>00447 <span class="comment"> * @brief The pre-function for the aggregation of splitting criteria values. It  </span>
 <a name="l00448"></a>00448 <span class="comment"> *        takes the state arrays produced by two sfunc calls and combines them together.</span>
 <a name="l00449"></a>00449 <span class="comment"> *</span>
 <a name="l00450"></a>00450 <span class="comment"> * @param sfunc1_result     The array from sfunc1.</span>
 <a name="l00451"></a>00451 <span class="comment"> * @param sfunc2_result     The array from sfunc2.</span>
 <a name="l00452"></a>00452 <span class="comment"> *</span>
 <a name="l00453"></a>00453 <span class="comment"> * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX</span>
 <a name="l00454"></a>00454 <span class="comment"> *         in dt.c for the detailed information of this array.</span>
 <a name="l00455"></a>00455 <span class="comment"> *</span>
 <a name="l00456"></a>00456 <span class="comment"> */</span>
 <a name="l00457"></a>00457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
 <a name="l00458"></a>00458     (
 <a name="l00459"></a>00459     sfunc1_result     FLOAT8[],
 <a name="l00460"></a>00460     sfunc2_result     FLOAT8[]
 <a name="l00461"></a>00461     )
 <a name="l00462"></a>00462 RETURNS FLOAT8[]
 <a name="l00463"></a>00463 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_prefunc&#39;</span>
 <a name="l00464"></a>00464 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00465"></a>00465
 <a name="l00466"></a>00466
 <a name="l00467"></a>00467 <span class="comment">/*</span>
 <a name="l00468"></a>00468 <span class="comment"> * @brief The final function for the aggregation of splitting criteria values.</span>
 <a name="l00469"></a>00469 <span class="comment"> *        It takes the state array produced by the sfunc and produces a</span>
 <a name="l00470"></a>00470 <span class="comment"> *        5-element array.</span>
 <a name="l00471"></a>00471 <span class="comment"> *</span>
 <a name="l00472"></a>00472 <span class="comment"> * @param internal_result   The 9-element state array produced by the sfunc (merged by the prefunc, if any)</span>
 <a name="l00473"></a>00473 <span class="comment"> *</span>
 <a name="l00474"></a>00474 <span class="comment"> * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX</span>
 <a name="l00475"></a>00475 <span class="comment"> *         in dt.c for the detailed information of this array.</span>
 <a name="l00476"></a>00476 <span class="comment"> *</span>
 <a name="l00477"></a>00477 <span class="comment"> */</span>
 <a name="l00478"></a>00478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
 <a name="l00479"></a>00479     (
 <a name="l00480"></a>00480     internal_result     FLOAT8[]
 <a name="l00481"></a>00481     )
 <a name="l00482"></a>00482 RETURNS FLOAT8[]
 <a name="l00483"></a>00483 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_ffunc&#39;</span>
 <a name="l00484"></a>00484 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00485"></a>00485
 <a name="l00486"></a>00486
 <a name="l00487"></a>00487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
 <a name="l00488"></a>00488     (
 <a name="l00489"></a>00489     INT,        -- sc
 <a name="l00490"></a>00490     BOOLEAN,    -- is_cont
 <a name="l00491"></a>00491     INT,        -- total number of classes
 <a name="l00492"></a>00492     FLOAT8[],   -- le array
 <a name="l00493"></a>00493     FLOAT8[],   -- total count array
 <a name="l00494"></a>00494     BIGINT      -- the total number of samples
 <a name="l00495"></a>00495     ) CASCADE;
 <a name="l00496"></a>00496 CREATE
 <a name="l00497"></a>00497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
 <a name="l00498"></a>00498     (
 <a name="l00499"></a>00499     INT,        -- sc
 <a name="l00500"></a>00500     BOOLEAN,    -- is_cont
 <a name="l00501"></a>00501     INT,        -- total number of classes
 <a name="l00502"></a>00502     FLOAT8[],   -- le array
 <a name="l00503"></a>00503     FLOAT8[],   -- total count array
 <a name="l00504"></a>00504     BIGINT      -- the total number of samples
 <a name="l00505"></a>00505     )
 <a name="l00506"></a>00506 (
 <a name="l00507"></a>00507   SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
 <a name="l00508"></a>00508   m4_ifdef(`__GREENPLUM__<span class="stringliteral">&#39;, `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,&#39;</span>)
 <a name="l00509"></a>00509   FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
 <a name="l00510"></a>00510   STYPE=FLOAT8[],
 <a name="l00511"></a>00511   initcond = &#39;{0, 0, 0, 0, 0, 0, 0, 0, 0}<span class="stringliteral">&#39;</span>
 <a name="l00512"></a>00512 <span class="stringliteral">  -- 1   sc: 1 infogain, 2 gainratio, 3 gini</span>
 <a name="l00513"></a>00513 <span class="stringliteral">  -- 2   is_cont</span>
 <a name="l00514"></a>00514 <span class="stringliteral">  -- 3   scv_class_info</span>
 <a name="l00515"></a>00515 <span class="stringliteral">  -- 4   scv_attr_info</span>
 <a name="l00516"></a>00516 <span class="stringliteral">  -- 5   scv_class_attr_info</span>
 <a name="l00517"></a>00517 <span class="stringliteral">  -- 6   scv_count</span>
 <a name="l00518"></a>00518 <span class="stringliteral">  -- 7   scv_total</span>
 <a name="l00519"></a>00519 <span class="stringliteral">  -- 8   max_class_id</span>
 <a name="l00520"></a>00520 <span class="stringliteral">  -- 9   max_class_count</span>
 <a name="l00521"></a>00521 <span class="stringliteral">);</span>
 <a name="l00522"></a>00522 <span class="stringliteral"></span>
 <a name="l00523"></a>00523 <span class="stringliteral"></span>
 <a name="l00524"></a>00524 <span class="stringliteral">/*</span>
 <a name="l00525"></a>00525 <span class="stringliteral"> * @brief Retrieve the specified number of unique features for a node.</span>
 <a name="l00526"></a>00526 <span class="stringliteral"> *        Discrete features used by ancestor nodes will be excluded.</span>
 <a name="l00527"></a>00527 <span class="stringliteral"> *        If the number of remaining features is less than or equal to the</span>
 <a name="l00528"></a>00528 <span class="stringliteral"> *        requested number of features, then all the remaining features</span>
 <a name="l00529"></a>00529 <span class="stringliteral"> *        will be returned. Otherwise, we will sample the requested </span>
 <a name="l00530"></a>00530 <span class="stringliteral"> *        number of features from the remaining features.</span>
 <a name="l00531"></a>00531 <span class="stringliteral"> *</span>
 <a name="l00532"></a>00532 <span class="stringliteral"> * @param num_req_features  The number of requested features.</span>
 <a name="l00533"></a>00533 <span class="stringliteral"> * @param num_features      The total number of features.</span>
 <a name="l00534"></a>00534 <span class="stringliteral"> * @param nid               The ID of the node for which the</span>
 <a name="l00535"></a>00535 <span class="stringliteral"> *                          features are sampled.</span>
 <a name="l00536"></a>00536 <span class="stringliteral"> * @param dp_fids           The IDs of the discrete features</span>
 <a name="l00537"></a>00537 <span class="stringliteral"> *                          used by the ancestors.</span>
 <a name="l00538"></a>00538 <span class="stringliteral"> *</span>
 <a name="l00539"></a>00539 <span class="stringliteral"> * @return An array containing all the IDs of chosen features.</span>
 <a name="l00540"></a>00540 <span class="stringliteral"> *</span>
 <a name="l00541"></a>00541 <span class="stringliteral"> */</span>
 <a name="l00542"></a>00542 <span class="stringliteral">CREATE OR REPLACE FUNCTION </span>
 <a name="l00543"></a>00543 <span class="stringliteral">MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])</span>
 <a name="l00544"></a>00544 <span class="stringliteral">RETURNS INT[]</span>
 <a name="l00545"></a>00545 <span class="stringliteral">AS &#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;, &#39;</span>dt_get_node_split_fids<span class="stringliteral">&#39;</span>
 <a name="l00546"></a>00546 <span class="stringliteral">LANGUAGE C VOLATILE;</span>
 <a name="l00547"></a>00547 <span class="stringliteral"></span>
 <a name="l00548"></a>00548 <span class="stringliteral"></span>
 <a name="l00549"></a>00549 <span class="stringliteral">/*</span>
 <a name="l00550"></a>00550 <span class="stringliteral"> * @brief Retrieve the selected features for a node. The existing table sf_assoc is </span>
 <a name="l00551"></a>00551 <span class="stringliteral"> *        truncated and repopulated with the association between selected feature IDs and</span>
 <a name="l00552"></a>00552 <span class="stringliteral"> *        node IDs.</span>
 <a name="l00553"></a>00553 <span class="stringliteral"> *</span>
 <a name="l00554"></a>00554 <span class="stringliteral"> * @param nid_table_name    The full name of the table which contains all the </span>
 <a name="l00555"></a>00555 <span class="stringliteral"> *                          node IDs.</span>
 <a name="l00556"></a>00556 <span class="stringliteral"> * @param result_table_name The full name of the table which contains the parent</span>
 <a name="l00557"></a>00557 <span class="stringliteral"> *                          discrete features for each node.</span>
 <a name="l00558"></a>00558 <span class="stringliteral"> * @param num_chosen_fids   The number of feature IDs that will be chosen for a node.</span>
 <a name="l00559"></a>00559 <span class="stringliteral"> * @param total_num_fids    The total number of feature IDs, total_num_fids </span>
 <a name="l00560"></a>00560 <span class="stringliteral"> *                          &gt;= num_chosen_fids.</span>
 <a name="l00561"></a>00561 <span class="stringliteral"> *                          If num_chosen_fids &lt; total_num_fids, then we will </span>
 <a name="l00562"></a>00562 <span class="stringliteral"> *                          randomly select num_chosen_fids features from all</span>
 <a name="l00563"></a>00563 <span class="stringliteral"> *                          the features. Otherwise, we will return all the  </span>
 <a name="l00564"></a>00564 <span class="stringliteral"> *                          features except those that belong to the parent discrete</span>
 <a name="l00565"></a>00565 <span class="stringliteral"> *                          features for a node.</span>
 <a name="l00566"></a>00566 <span class="stringliteral"> * @param verbosity         &gt; 0 means this function runs in verbose mode.</span>
 <a name="l00567"></a>00567 <span class="stringliteral"> *                    </span>
 <a name="l00568"></a>00568 <span class="stringliteral"> * @return A constant string for the association table name.</span>
 <a name="l00569"></a>00569 <span class="stringliteral"> *</span>
 <a name="l00570"></a>00570 <span class="stringliteral"> */</span>
 <a name="l00571"></a>00571 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes</span>
 <a name="l00572"></a>00572 <span class="stringliteral">    (</span>
 <a name="l00573"></a>00573 <span class="stringliteral">    nid_table_name        TEXT,</span>
 <a name="l00574"></a>00574 <span class="stringliteral">    result_table_name     TEXT,</span>
 <a name="l00575"></a>00575 <span class="stringliteral">    num_chosen_fids       INT,</span>
 <a name="l00576"></a>00576 <span class="stringliteral">    total_num_fids        INT,</span>
 <a name="l00577"></a>00577 <span class="stringliteral">    verbosity             INT</span>
 <a name="l00578"></a>00578 <span class="stringliteral">    )</span>
 <a name="l00579"></a>00579 <span class="stringliteral">RETURNS TEXT AS $$</span>
 <a name="l00580"></a>00580 <span class="stringliteral">DECLARE</span>
 <a name="l00581"></a>00581 <span class="stringliteral">    curstmt     TEXT;</span>
 <a name="l00582"></a>00582 <span class="stringliteral">BEGIN</span>
 <a name="l00583"></a>00583 <span class="stringliteral">    -- The sf_assoc table records which features are used</span>
 <a name="l00584"></a>00584 <span class="stringliteral">    -- for finding the best split for a node.</span>
 <a name="l00585"></a>00585 <span class="stringliteral">    -- It has two columns:</span>
 <a name="l00586"></a>00586 <span class="stringliteral">    --      nid -- The id of a node.</span>
 <a name="l00587"></a>00587 <span class="stringliteral">    --      fid -- The id of a feature.</span>
 <a name="l00588"></a>00588 <span class="stringliteral">    EXECUTE &#39;</span>TRUNCATE sf_assoc<span class="stringliteral">&#39;;</span>
 <a name="l00589"></a>00589 <span class="stringliteral">    </span>
 <a name="l00590"></a>00590 <span class="stringliteral">    curstmt = MADLIB_SCHEMA.__format</span>
 <a name="l00591"></a>00591 <span class="stringliteral">                (</span>
 <a name="l00592"></a>00592 <span class="stringliteral">                    &#39;</span>INSERT INTO sf_assoc(nid, fid)
 <a name="l00593"></a>00593                      SELECT
 <a name="l00594"></a>00594                        nid,
 <a name="l00595"></a>00595                        unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
 <a name="l00596"></a>00596                                 nid,dp_ids)) as fid
 <a name="l00597"></a>00597                      FROM (SELECT nid, dp_ids
 <a name="l00598"></a>00598                            FROM % s1, % s2
 <a name="l00599"></a>00599                            WHERE s1.nid = s2.<span class="keywordtype">id</span>
 <a name="l00600"></a>00600                            GROUP BY nid, dp_ids) t&#39;,
 <a name="l00601"></a>00601                     ARRAY[
 <a name="l00602"></a>00602                         num_chosen_fids::TEXT,
 <a name="l00603"></a>00603                         total_num_fids::TEXT,
 <a name="l00604"></a>00604                         nid_table_name,
 <a name="l00605"></a>00605                         result_table_name
 <a name="l00606"></a>00606                         ]
 <a name="l00607"></a>00607                 );
 <a name="l00608"></a>00608
 <a name="l00609"></a>00609      IF (verbosity &gt; 0) THEN
 <a name="l00610"></a>00610         RAISE INFO &#39;build sample feature association stmt: %&#39;, curstmt;
 <a name="l00611"></a>00611      END IF;
 <a name="l00612"></a>00612
 <a name="l00613"></a>00613      EXECUTE curstmt;
 <a name="l00614"></a>00614
 <a name="l00615"></a>00615      -- we return a constant <span class="keywordtype">string</span> for the association table name
 <a name="l00616"></a>00616      return &#39;sf_assoc&#39;;
 <a name="l00617"></a>00617
 <a name="l00618"></a>00618 END
 <a name="l00619"></a>00619 $$ LANGUAGE PLPGSQL;
 <a name="l00620"></a>00620
 <a name="l00621"></a>00621
 <a name="l00622"></a>00622 <span class="comment">/*</span>
 <a name="l00623"></a>00623 <span class="comment"> * This UDT is used to keep the times of generating acc.</span>
 <a name="l00624"></a>00624 <span class="comment"> *</span>
 <a name="l00625"></a>00625 <span class="comment"> * calc_pre_time   The time of pre-processing.</span>
 <a name="l00626"></a>00626 <span class="comment"> * calc_acc_time   The time of calculating acc.</span>
 <a name="l00627"></a>00627 <span class="comment"> *</span>
 <a name="l00628"></a>00628 <span class="comment"> */</span>
 <a name="l00629"></a>00629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
 <a name="l00630"></a>00630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
 <a name="l00631"></a>00631 (
 <a name="l00632"></a>00632     calc_pre_time       INTERVAL,
 <a name="l00633"></a>00633     calc_acc_time       INTERVAL
 <a name="l00634"></a>00634 );
 <a name="l00635"></a>00635
 <a name="l00636"></a>00636
 <a name="l00637"></a>00637 <span class="comment">/*</span>
 <a name="l00638"></a>00638 <span class="comment"> * @brief Generate the ACC for current leaf nodes.</span>
 <a name="l00639"></a>00639 <span class="comment"> *</span>
 <a name="l00640"></a>00640 <span class="comment"> * @param encoded_table_name    The full name of the encoded table for the  </span>
 <a name="l00641"></a>00641 <span class="comment"> *                              training table.</span>
 <a name="l00642"></a>00642 <span class="comment"> * @param metatable_name        The full name of the metatable that contains the  </span>
 <a name="l00643"></a>00643 <span class="comment"> *                              relevant information of the input table.</span>
 <a name="l00644"></a>00644 <span class="comment"> * @param result_table_name     The full name of the training result table.</span>
 <a name="l00645"></a>00645 <span class="comment"> * @param num_featrue_try       The number of features that will be chosen per node. </span>
 <a name="l00646"></a>00646 <span class="comment"> * @param num_classes           Total number of classes in training set.</span>
 <a name="l00647"></a>00647 <span class="comment"> * @param verbosity             &gt; 0 means this function runs in verbose mode. </span>
 <a name="l00648"></a>00648 <span class="comment"> *                    </span>
 <a name="l00649"></a>00649 <span class="comment"> * @return The time information for generating ACC.</span>
 <a name="l00650"></a>00650 <span class="comment"> *</span>
 <a name="l00651"></a>00651 <span class="comment"> */</span>
 <a name="l00652"></a>00652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
 <a name="l00653"></a>00653     (
 <a name="l00654"></a>00654     encoded_table_name      TEXT,
 <a name="l00655"></a>00655     metatable_name          TEXT,
 <a name="l00656"></a>00656     result_table_name       TEXT,
 <a name="l00657"></a>00657     tr_table_name           TEXT,
 <a name="l00658"></a>00658     sf_table_name           TEXT,
 <a name="l00659"></a>00659     num_featrue_try         INT,
 <a name="l00660"></a>00660     num_classes             INT,
 <a name="l00661"></a>00661     sampling_needed         BOOLEAN,
 <a name="l00662"></a>00662     verbosity               INT
 <a name="l00663"></a>00663     )
 <a name="l00664"></a>00664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
 <a name="l00665"></a>00665 DECLARE
 <a name="l00666"></a>00666     curstmt             TEXT := &#39;&#39;;
 <a name="l00667"></a>00667     num_fids            INT  := 1;
 <a name="l00668"></a>00668     begin_calc_acc      TIMESTAMP;
 <a name="l00669"></a>00669     begin_calc_pre      TIMESTAMP;
 <a name="l00670"></a>00670     ret                 MADLIB_SCHEMA.__gen_acc_time;
 <a name="l00671"></a>00671     select_stmt         TEXT;
 <a name="l00672"></a>00672 BEGIN
 <a name="l00673"></a>00673     begin_calc_pre = clock_timestamp();
 <a name="l00674"></a>00674
 <a name="l00675"></a>00675     -- get the number of features
 <a name="l00676"></a>00676     curstmt = MADLIB_SCHEMA.__format
 <a name="l00677"></a>00677                 (
 <a name="l00678"></a>00678                 &#39;SELECT COUNT(<span class="keywordtype">id</span>)
 <a name="l00679"></a>00679                 FROM %
 <a name="l00680"></a>00680                 WHERE column_type = &#39;&#39;f&#39;&#39;&#39;,
 <a name="l00681"></a>00681                 metatable_name
 <a name="l00682"></a>00682                 );
 <a name="l00683"></a>00683     EXECUTE curstmt INTO num_fids;
 <a name="l00684"></a>00684
 <a name="l00685"></a>00685     -- preprocessing time
 <a name="l00686"></a>00686     ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
 <a name="l00687"></a>00687     begin_calc_acc    = clock_timestamp();
 <a name="l00688"></a>00688
 <a name="l00689"></a>00689     IF (sampling_needed) THEN
 <a name="l00690"></a>00690         PERFORM MADLIB_SCHEMA.__get_features_of_nodes
 <a name="l00691"></a>00691             (
 <a name="l00692"></a>00692                 tr_table_name,
 <a name="l00693"></a>00693                 result_table_name,
 <a name="l00694"></a>00694                 num_featrue_try,
 <a name="l00695"></a>00695                 num_fids,
 <a name="l00696"></a>00696                 verbosity
 <a name="l00697"></a>00697             );
 <a name="l00698"></a>00698
 <a name="l00699"></a>00699         select_stmt =  MADLIB_SCHEMA.__format
 <a name="l00700"></a>00700              (
 <a name="l00701"></a>00701                 &#39;SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
 <a name="l00702"></a>00702                         ed.class, sum(weight) as count
 <a name="l00703"></a>00703                  FROM % ed, % tr, % sf
 <a name="l00704"></a>00704                  WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.<span class="keywordtype">id</span> = tr.<span class="keywordtype">id</span>
 <a name="l00705"></a>00705                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
 <a name="l00706"></a>00706                             ed.is_cont, ed.class&#39;,
 <a name="l00707"></a>00707                ARRAY[
 <a name="l00708"></a>00708                    encoded_table_name,
 <a name="l00709"></a>00709                    tr_table_name,
 <a name="l00710"></a>00710                    sf_table_name
 <a name="l00711"></a>00711                ]
 <a name="l00712"></a>00712             );
 <a name="l00713"></a>00713     ELSE
 <a name="l00714"></a>00714         select_stmt =  MADLIB_SCHEMA.__format
 <a name="l00715"></a>00715              (
 <a name="l00716"></a>00716                 &#39;SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
 <a name="l00717"></a>00717                         ed.class, sum(weight) as count
 <a name="l00718"></a>00718                  FROM % ed, % tr
 <a name="l00719"></a>00719                  WHERE ed.<span class="keywordtype">id</span> = tr.<span class="keywordtype">id</span>
 <a name="l00720"></a>00720                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
 <a name="l00721"></a>00721                             ed.is_cont, ed.class&#39;,
 <a name="l00722"></a>00722                ARRAY[
 <a name="l00723"></a>00723                    encoded_table_name,
 <a name="l00724"></a>00724                    tr_table_name
 <a name="l00725"></a>00725                ]
 <a name="l00726"></a>00726             );
 <a name="l00727"></a>00727     END IF;
 <a name="l00728"></a>00728     DROP TABLE IF EXISTS training_instance_aux;
 <a name="l00729"></a>00729     curstmt = MADLIB_SCHEMA.__format
 <a name="l00730"></a>00730         (
 <a name="l00731"></a>00731             &#39;CREATE TEMP TABLE training_instance_aux AS
 <a name="l00732"></a>00732              SELECT tid, nid, fid, fval, is_cont,
 <a name="l00733"></a>00733                     MADLIB_SCHEMA.__dt_acc_count_aggr
 <a name="l00734"></a>00734                         (%,count::BIGINT,class::INT) AS count
 <a name="l00735"></a>00735              FROM
 <a name="l00736"></a>00736              (
 <a name="l00737"></a>00737                  %
 <a name="l00738"></a>00738              ) l
 <a name="l00739"></a>00739              GROUP BY tid,nid,fid, fval,is_cont
 <a name="l00740"></a>00740              m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (fid, fval)&#39;)&#39;,
 <a name="l00741"></a>00741             ARRAY[
 <a name="l00742"></a>00742                 num_classes::TEXT,
 <a name="l00743"></a>00743                 select_stmt
 <a name="l00744"></a>00744             ]
 <a name="l00745"></a>00745         );
 <a name="l00746"></a>00746
 <a name="l00747"></a>00747     IF ( verbosity&gt;0 ) THEN
 <a name="l00748"></a>00748         RAISE INFO &#39;%&#39;, curstmt;
 <a name="l00749"></a>00749     END IF;
 <a name="l00750"></a>00750
 <a name="l00751"></a>00751     EXECUTE curstmt;
 <a name="l00752"></a>00752     ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
 <a name="l00753"></a>00753
 <a name="l00754"></a>00754     RETURN ret;
 <a name="l00755"></a>00755 END
 <a name="l00756"></a>00756 $$ LANGUAGE PLPGSQL;
 <a name="l00757"></a>00757
 <a name="l00758"></a>00758
 <a name="l00759"></a>00759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
 <a name="l00760"></a>00760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
 <a name="l00761"></a>00761     (
 <a name="l00762"></a>00762     numOfOrgClasses BIGINT[]
 <a name="l00763"></a>00763     );
 <a name="l00764"></a>00764
 <a name="l00765"></a>00765
 <a name="l00766"></a>00766 <span class="comment">/*</span>
 <a name="l00767"></a>00767 <span class="comment"> * @brief The step function for aggregating the class counts while doing Reduced </span>
 <a name="l00768"></a>00768 <span class="comment"> *        Error Pruning (REP).</span>
 <a name="l00769"></a>00769 <span class="comment"> *</span>
 <a name="l00770"></a>00770 <span class="comment"> * @param class_count_array     The array used to store the accumulated information.</span>
 <a name="l00771"></a>00771 <span class="comment"> *                              [0]: the total number of mis-classified samples.</span>
 <a name="l00772"></a>00772 <span class="comment"> *                              [i]: the number of samples belonging to the ith class.</span>
 <a name="l00773"></a>00773 <span class="comment"> * @param classified_class      The predicted class based on our trained DT model.</span>
 <a name="l00774"></a>00774 <span class="comment"> * @param original_class        The real class value provided in the validation set.</span>
 <a name="l00775"></a>00775 <span class="comment"> * @param max_num_of_classes    The total number of distinct class values. </span>
 <a name="l00776"></a>00776 <span class="comment"> *                    </span>
 <a name="l00777"></a>00777 <span class="comment"> * @return An updated class count array.</span>
 <a name="l00778"></a>00778 <span class="comment"> *</span>
 <a name="l00779"></a>00779 <span class="comment"> */</span>
 <a name="l00780"></a>00780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
 <a name="l00781"></a>00781     (
 <a name="l00782"></a>00782     class_count_array       BIGINT[],
 <a name="l00783"></a>00783     classified_class        INT,
 <a name="l00784"></a>00784     original_class          INT,
 <a name="l00785"></a>00785     max_num_of_classes      INT
 <a name="l00786"></a>00786     )
 <a name="l00787"></a>00787 RETURNS BIGINT[]
 <a name="l00788"></a>00788 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_rep_aggr_class_count_sfunc&#39;
 <a name="l00789"></a>00789 LANGUAGE C IMMUTABLE;
 <a name="l00790"></a>00790
 <a name="l00791"></a>00791
 <a name="l00792"></a>00792 <span class="comment">/*</span>
 <a name="l00793"></a>00793 <span class="comment"> * @brief Add the corresponding elements of the input arrays </span>
 <a name="l00794"></a>00794 <span class="comment"> *        to create a new one.</span>
 <a name="l00795"></a>00795 <span class="comment"> *</span>
 <a name="l00796"></a>00796 <span class="comment"> * @param 1 arg     The array 1.</span>
 <a name="l00797"></a>00797 <span class="comment"> * @param 2 arg     The array 2.</span>
 <a name="l00798"></a>00798 <span class="comment"> *                    </span>
 <a name="l00799"></a>00799 <span class="comment"> * @return The new array.</span>
 <a name="l00800"></a>00800 <span class="comment"> *</span>
 <a name="l00801"></a>00801 <span class="comment"> */</span>
 <a name="l00802"></a>00802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
 <a name="l00803"></a>00803     (
 <a name="l00804"></a>00804     BIGINT[],
 <a name="l00805"></a>00805     BIGINT[]
 <a name="l00806"></a>00806     )
 <a name="l00807"></a>00807 RETURNS BIGINT[]
 <a name="l00808"></a>00808 AS &#39;MODULE_PATHNAME&#39;, &#39;bigint_array_add&#39;
 <a name="l00809"></a>00809 LANGUAGE C IMMUTABLE;
 <a name="l00810"></a>00810
 <a name="l00811"></a>00811
 <a name="l00812"></a>00812 <span class="comment">/*</span>
 <a name="l00813"></a>00813 <span class="comment"> * @brief The final function for aggregating the class counts for REP. </span>
 <a name="l00814"></a>00814 <span class="comment"> *        It takes the class count array produced by the sfunc and produces a </span>
 <a name="l00815"></a>00815 <span class="comment"> *        two-element array. The first element is the ID of the class that has </span>
 <a name="l00816"></a>00816 <span class="comment"> *        the maximum number of samples represented by the root node of the subtree</span>
 <a name="l00817"></a>00817 <span class="comment"> *        being processed. The second element is the number of reduced  </span>
 <a name="l00818"></a>00818 <span class="comment"> *        misclassified samples if the leaf nodes of the subtree are pruned.</span>
 <a name="l00819"></a>00819 <span class="comment"> *</span>
 <a name="l00820"></a>00820 <span class="comment"> * @param class_count_data     The array containing all the information for the </span>
 <a name="l00821"></a>00821 <span class="comment"> *                             calculation of Reduced-Error pruning. </span>
 <a name="l00822"></a>00822 <span class="comment"> *                    </span>
 <a name="l00823"></a>00823 <span class="comment"> * @return A two element array.</span>
 <a name="l00824"></a>00824 <span class="comment"> *</span>
 <a name="l00825"></a>00825 <span class="comment"> */</span>
 <a name="l00826"></a>00826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
 <a name="l00827"></a>00827     (
 <a name="l00828"></a>00828     class_count_array       BIGINT[]
 <a name="l00829"></a>00829     )
 <a name="l00830"></a>00830 RETURNS BIGINT[]
 <a name="l00831"></a>00831 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_rep_aggr_class_count_ffunc&#39;
 <a name="l00832"></a>00832 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00833"></a>00833
 <a name="l00834"></a>00834
 <a name="l00835"></a>00835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
 <a name="l00836"></a>00836     (
 <a name="l00837"></a>00837     INT,
 <a name="l00838"></a>00838     INT,
 <a name="l00839"></a>00839     INT
 <a name="l00840"></a>00840     );
 <a name="l00841"></a>00841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
 <a name="l00842"></a>00842     (
 <a name="l00843"></a>00843     INT,
 <a name="l00844"></a>00844     INT,
 <a name="l00845"></a>00845     INT
 <a name="l00846"></a>00846     )
 <a name="l00847"></a>00847 (
 <a name="l00848"></a>00848   SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
 <a name="l00849"></a>00849   m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;)
 <a name="l00850"></a>00850   FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
 <a name="l00851"></a>00851   STYPE=BIGINT[]
 <a name="l00852"></a>00852 );
 <a name="l00853"></a>00853
 <a name="l00854"></a>00854
 <a name="l00855"></a>00855 <span class="comment">/*</span>
 <a name="l00856"></a>00856 <span class="comment"> * @brief The step function of the aggregate __array_indexed_agg.</span>
 <a name="l00857"></a>00857 <span class="comment"> *</span>
 <a name="l00858"></a>00858 <span class="comment"> * @param state         The step state array of the aggregate function.</span>
 <a name="l00859"></a>00859 <span class="comment"> * @param elem          The element to be filled into the state array.</span>
 <a name="l00860"></a>00860 <span class="comment"> * @param elem_cnt      The number of elements.</span>
 <a name="l00861"></a>00861 <span class="comment"> * @param elem_idx      The subscript of &quot;elem&quot; in the state array.</span>
 <a name="l00862"></a>00862 <span class="comment"> * </span>
 <a name="l00863"></a>00863 <span class="comment"> */</span>
 <a name="l00864"></a>00864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
 <a name="l00865"></a>00865     (
 <a name="l00866"></a>00866     state       float8[],
 <a name="l00867"></a>00867     elem        float8,
 <a name="l00868"></a>00868     elem_cnt    int8,
 <a name="l00869"></a>00869     elem_idx    int8
 <a name="l00870"></a>00870     )
 <a name="l00871"></a>00871 RETURNS float8[]
 <a name="l00872"></a>00872 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_sfunc&#39;
 <a name="l00873"></a>00873 LANGUAGE C IMMUTABLE;
 <a name="l00874"></a>00874
 <a name="l00875"></a>00875
 <a name="l00876"></a>00876 <span class="comment">/*</span>
 <a name="l00877"></a>00877 <span class="comment"> * @brief The Pre-function of the aggregate __array_indexed_agg.</span>
 <a name="l00878"></a>00878 <span class="comment"> * </span>
 <a name="l00879"></a>00879 <span class="comment"> * @param arg0  The first state array.</span>
 <a name="l00880"></a>00880 <span class="comment"> * @param arg1  The second state array.</span>
 <a name="l00881"></a>00881 <span class="comment"> *  </span>
 <a name="l00882"></a>00882 <span class="comment"> * @return The combined state.  </span>
 <a name="l00883"></a>00883 <span class="comment"> *</span>
 <a name="l00884"></a>00884 <span class="comment"> */</span>
 <a name="l00885"></a>00885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
 <a name="l00886"></a>00886     (
 <a name="l00887"></a>00887     float8[],
 <a name="l00888"></a>00888     float8[]
 <a name="l00889"></a>00889     )
 <a name="l00890"></a>00890 RETURNS float8[]
 <a name="l00891"></a>00891 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_prefunc&#39;
 <a name="l00892"></a>00892 LANGUAGE C STRICT IMMUTABLE;
 <a name="l00893"></a>00893
 <a name="l00894"></a>00894
 <a name="l00895"></a>00895 <span class="comment">/*</span>
 <a name="l00896"></a>00896 <span class="comment"> * @brief The final function of __array_indexed_agg.</span>
 <a name="l00897"></a>00897 <span class="comment"> * </span>
 <a name="l00898"></a>00898 <span class="comment"> * @param state  The state array.</span>
 <a name="l00899"></a>00899 <span class="comment"> * </span>
 <a name="l00900"></a>00900 <span class="comment"> * @return The aggregate result.</span>
 <a name="l00901"></a>00901 <span class="comment"> *</span>
 <a name="l00902"></a>00902 <span class="comment"> */</span>
 <a name="l00903"></a>00903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
 <a name="l00904"></a>00904     (
 <a name="l00905"></a>00905     float8[]
 <a name="l00906"></a>00906     )
 <a name="l00907"></a>00907 RETURNS float8[]
 <a name="l00908"></a>00908 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_ffunc&#39;
 <a name="l00909"></a>00909 LANGUAGE C IMMUTABLE;
 <a name="l00910"></a>00910
 <a name="l00911"></a>00911
 <a name="l00912"></a>00912 <span class="comment">/*</span>
 <a name="l00913"></a>00913 <span class="comment"> * @brief This aggregate behaves like array_agg: it accumulates the</span>
 <a name="l00914"></a>00914 <span class="comment"> *        elements in each group into an array, except that users </span>
 <a name="l00915"></a>00915 <span class="comment"> *        provide the subscript for each element. This aggregate will be </span>
 <a name="l00916"></a>00916 <span class="comment"> *        invoked as HashAggregate, while array_agg will be called as </span>
 <a name="l00917"></a>00917 <span class="comment"> *        GroupAggregate. Therefore, our implementation has better performance</span>
 <a name="l00918"></a>00918 <span class="comment"> *        than array_agg.</span>
 <a name="l00919"></a>00919 <span class="comment"> * </span>
 <a name="l00920"></a>00920 <span class="comment"> * @param elem     The element to be fed into the returned array of this aggregate.</span>
 <a name="l00921"></a>00921 <span class="comment"> * @param elem_cnt The number of elements.</span>
 <a name="l00922"></a>00922 <span class="comment"> * @param elem_idx The subscript of the element.</span>
 <a name="l00923"></a>00923 <span class="comment"> *</span>
 <a name="l00924"></a>00924 <span class="comment"> * @return The aggregated array.</span>
 <a name="l00925"></a>00925 <span class="comment"> *</span>
 <a name="l00926"></a>00926 <span class="comment"> */</span>
 <a name="l00927"></a>00927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
 <a name="l00928"></a>00928     SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
 <a name="l00929"></a>00929     m4_ifdef( `__GREENPLUM__&#39;,`PREFUNC   = MADLIB_SCHEMA.__array_indexed_agg_prefunc,&#39;)
 <a name="l00930"></a>00930     FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
 <a name="l00931"></a>00931     STYPE = float8[]
 <a name="l00932"></a>00932 );
 <a name="l00933"></a>00933
 <a name="l00934"></a>00934
 <a name="l00935"></a>00935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
 <a name="l00936"></a>00936     (
 <a name="l00937"></a>00937     count_array         BIGINT[],
 <a name="l00938"></a>00938     num_of_class        INT,
 <a name="l00939"></a>00939     count               BIGINT,
 <a name="l00940"></a>00940     class               INT
 <a name="l00941"></a>00941     )
 <a name="l00942"></a>00942 RETURNS BIGINT[]
 <a name="l00943"></a>00943 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_acc_count_sfunc&#39;
 <a name="l00944"></a>00944 LANGUAGE C VOLATILE;
 <a name="l00945"></a>00945
 <a name="l00946"></a>00946
 <a name="l00947"></a>00947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
 <a name="l00948"></a>00948     (
 <a name="l00949"></a>00949     INT,
 <a name="l00950"></a>00950     BIGINT,
 <a name="l00951"></a>00951     INT
 <a name="l00952"></a>00952     )
 <a name="l00953"></a>00953 (
 <a name="l00954"></a>00954   SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
 <a name="l00955"></a>00955   m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;)
 <a name="l00956"></a>00956   STYPE=BIGINT[]
 <a name="l00957"></a>00957 );
 <a name="l00958"></a>00958
 <a name="l00959"></a>00959
 <a name="l00960"></a>00960 <span class="comment">/*</span>
 <a name="l00961"></a>00961 <span class="comment"> * @brief This aggregate is created for PostgreSQL, which doesn&#39;t support the</span>
 <a name="l00962"></a>00962 <span class="comment"> *        sum function over an array.</span>
 <a name="l00963"></a>00963 <span class="comment"> * </span>
 <a name="l00964"></a>00964 <span class="comment"> * @param elem     The element to be fed into the returned array of this aggregate.</span>
 <a name="l00965"></a>00965 <span class="comment"> *</span>
 <a name="l00966"></a>00966 <span class="comment"> * @return The array with the sum of all the input arrays in a group.</span>
 <a name="l00967"></a>00967 <span class="comment"> *</span>
 <a name="l00968"></a>00968 <span class="comment"> */</span>
 <a name="l00969"></a>00969 CREATE
 <a name="l00970"></a>00970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
 <a name="l00971"></a>00971     (
 <a name="l00972"></a>00972     BIGINT[]
 <a name="l00973"></a>00973     )
 <a name="l00974"></a>00974 (
 <a name="l00975"></a>00975   SFUNC=MADLIB_SCHEMA.__bigint_array_add,
 <a name="l00976"></a>00976   m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;)
 <a name="l00977"></a>00977   STYPE=BIGINT[]
 <a name="l00978"></a>00978 );
 <a name="l00979"></a>00979
 <a name="l00980"></a>00980
 <a name="l00981"></a>00981 <span class="comment">/*</span>
 <a name="l00982"></a>00982 <span class="comment"> * @brief This function finds the best split and stores the information in the output table.</span>
 <a name="l00983"></a>00983 <span class="comment"> *</span>
 <a name="l00984"></a>00984 <span class="comment"> * @param table_name          The name of the table containing the training</span>
 <a name="l00985"></a>00985 <span class="comment"> *                            set.</span>
 <a name="l00986"></a>00986 <span class="comment"> * @param confidence_level    This parameter is used by the &#39;Error-Based Pruning&#39;.</span>
 <a name="l00987"></a>00987 <span class="comment"> *                            Please refer to the paper for detailed definition.</span>
 <a name="l00988"></a>00988 <span class="comment"> *                            The paper&#39;s name is &#39;Error-Based Pruning of Decision  </span>
 <a name="l00989"></a>00989 <span class="comment"> *                            Trees Grown on Very Large Data Sets Can Work!&#39;.</span>
 <a name="l00990"></a>00990 <span class="comment"> * @param feature_table_name  It is the name of an internal table, which contains</span>
 <a name="l00991"></a>00991 <span class="comment"> *                            meta data for each feature.</span>
 <a name="l00992"></a>00992 <span class="comment"> * @param split_criterion     It defines the split criterion to be used.</span>
 <a name="l00993"></a>00993 <span class="comment"> *                            (1- information gain. 2- gain ratio. 3- gini).</span>
 <a name="l00994"></a>00994 <span class="comment"> * @param continue_grow       It specifies whether we should still grow the tree</span>
 <a name="l00995"></a>00995 <span class="comment"> *                            on the selected branch.</span>
 <a name="l00996"></a>00996 <span class="comment"> * @param output_table        It specifies the table used to store the chosen splits.</span>
 <a name="l00997"></a>00997 <span class="comment"> * @param h2hmv_routine_id    Specifies how to handle missing values. </span>
 <a name="l00998"></a>00998 <span class="comment"> *                            1 ignore, 2 explicit.</span>
 <a name="l00999"></a>00999 <span class="comment"> *                    </span>
 <a name="l01000"></a>01000 <span class="comment"> */</span>
 <a name="l01001"></a>01001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
 <a name="l01002"></a>01002     (
 <a name="l01003"></a>01003     table_name              TEXT,
 <a name="l01004"></a>01004     confidence_level        FLOAT,
 <a name="l01005"></a>01005     feature_table_name      TEXT,
 <a name="l01006"></a>01006     split_criterion         INT,
 <a name="l01007"></a>01007     continue_grow           INT,
 <a name="l01008"></a>01008     output_table            TEXT,
 <a name="l01009"></a>01009     h2hmv_routine_id        INT,
 <a name="l01010"></a>01010     num_classes             INT
 <a name="l01011"></a>01011     )
 <a name="l01012"></a>01012 RETURNS VOID AS $$
 <a name="l01013"></a>01013 DECLARE
 <a name="l01014"></a>01014     total_size         INT;
 <a name="l01015"></a>01015     curstmt            TEXT := &#39;&#39;;
 <a name="l01016"></a>01016     begin_func_exec    TIMESTAMP;
 <a name="l01017"></a>01017     select_stmt        TEXT;
 <a name="l01018"></a>01018 BEGIN
 <a name="l01019"></a>01019     begin_func_exec = clock_timestamp();
 <a name="l01020"></a>01020
 <a name="l01021"></a>01021     IF (h2hmv_routine_id=1) THEN
 <a name="l01022"></a>01022         -- For ignore, we need the true size of nodes to handle the missing values.
 <a name="l01023"></a>01023         select_stmt =
 <a name="l01024"></a>01024            &#39;SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
 <a name="l01025"></a>01025             FROM
 <a name="l01026"></a>01026             (
 <a name="l01027"></a>01027                 SELECT tid, nid, fid,
 <a name="l01028"></a>01028                 m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) as total
 <a name="l01029"></a>01029                 FROM training_instance_aux
 <a name="l01030"></a>01030                 GROUP BY tid, nid, fid
 <a name="l01031"></a>01031             ) t1 INNER JOIN node_size_aux t2
 <a name="l01032"></a>01032             ON t1.tid=t2.tid AND t1.nid=t2.nid&#39;;
 <a name="l01033"></a>01033     ELSE
 <a name="l01034"></a>01034         -- For explicit, the calculated node size from the aggregation is correct.
 <a name="l01035"></a>01035         -- We can set NULL, which denotes we can safely use the counted value.
 <a name="l01036"></a>01036         select_stmt =
 <a name="l01037"></a>01037            &#39;SELECT tid, nid, fid,
 <a name="l01038"></a>01038             m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) as total,
 <a name="l01039"></a>01039             NULL::BIGINT AS node_size
 <a name="l01040"></a>01040             FROM training_instance_aux
 <a name="l01041"></a>01041             GROUP BY tid, nid, fid&#39;;
 <a name="l01042"></a>01042     END IF;
 <a name="l01043"></a>01043
 <a name="l01044"></a>01044     <span class="comment">/*</span>
 <a name="l01045"></a>01045 <span class="comment">     * This table is used to store information for the calculated best split </span>
 <a name="l01046"></a>01046 <span class="comment">     *</span>
 <a name="l01047"></a>01047 <span class="comment">     * tid                  The ID of the tree.</span>
 <a name="l01048"></a>01048 <span class="comment">     * node_id              The ID of one node in the specified tree.</span>
 <a name="l01049"></a>01049 <span class="comment">     * feature              The ID of the selected feature.</span>
 <a name="l01050"></a>01050 <span class="comment">     * probability          The predicted probability of our chosen class.</span>
 <a name="l01051"></a>01051 <span class="comment">     * max_class            The ID of the class chosen by the algorithm.</span>
 <a name="l01052"></a>01052 <span class="comment">     * max_scv              The maximum split criterion value.</span>
 <a name="l01053"></a>01053 <span class="comment">     * live                 1- For the chosen split, we should split further.</span>
 <a name="l01054"></a>01054 <span class="comment">     *                      0- For the chosen split, we shouldn&#39;t split further.</span>
 <a name="l01055"></a>01055 <span class="comment">     * ebp_coeff            total error for error-based pruning.</span>
 <a name="l01056"></a>01056 <span class="comment">     * is_cont              whether the selected feature is continuous.</span>
 <a name="l01057"></a>01057 <span class="comment">     * split_value          If the selected feature is continuous, it specifies</span>
 <a name="l01058"></a>01058 <span class="comment">     *                      the split value. Otherwise, it is of no use.</span>
 <a name="l01059"></a>01059 <span class="comment">     * distinct_features    The number of distinct values for the selected feature.</span>
 <a name="l01060"></a>01060 <span class="comment">     * node_size            The size of this tree node. </span>
 <a name="l01061"></a>01061 <span class="comment">     *</span>
 <a name="l01062"></a>01062 <span class="comment">     */</span>
 <a name="l01063"></a>01063     EXECUTE &#39;DROP TABLE IF EXISTS &#39;||output_table;
 <a name="l01064"></a>01064     EXECUTE &#39;CREATE TEMP TABLE &#39;||output_table||&#39;
 <a name="l01065"></a>01065     (
 <a name="l01066"></a>01066         tid                 INT,
 <a name="l01067"></a>01067         node_id             INT,
 <a name="l01068"></a>01068         feature             INT,
 <a name="l01069"></a>01069         probability         FLOAT,
 <a name="l01070"></a>01070         max_class           INTEGER,
 <a name="l01071"></a>01071         max_scv             FLOAT,
 <a name="l01072"></a>01072         live                INT,
 <a name="l01073"></a>01073         ebp_coeff           FLOAT,
 <a name="l01074"></a>01074         is_cont             BOOLEAN,
 <a name="l01075"></a>01075         split_value         FLOAT,
 <a name="l01076"></a>01076         distinct_features   INT,
 <a name="l01077"></a>01077         node_size           INT
 <a name="l01078"></a>01078     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (node_id)&#39;);&#39;;
 <a name="l01079"></a>01079
 <a name="l01080"></a>01080
 <a name="l01081"></a>01081     EXECUTE &#39;DROP TABLE IF EXISTS tmp_best_table&#39;;
 <a name="l01082"></a>01082
 <a name="l01083"></a>01083     SELECT MADLIB_SCHEMA.__format
 <a name="l01084"></a>01084         (
 <a name="l01085"></a>01085         &#39;INSERT INTO %
 <a name="l01086"></a>01086          SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
 <a name="l01087"></a>01087                 CASE WHEN (best_scv[1] &lt; 1e-9   OR
 <a name="l01088"></a>01088                            best_scv[4] &gt; 1-1e-9 OR % &lt;= 0 ) THEN
 <a name="l01089"></a>01089                         0
 <a name="l01090"></a>01090                 ELSE
 <a name="l01091"></a>01091                         1
 <a name="l01092"></a>01092                 END AS live,
 <a name="l01093"></a>01093                 MADLIB_SCHEMA.__ebp_calc_errors
 <a name="l01094"></a>01094                     (best_scv[5], best_scv[4], %) AS ebp_coeff,
 <a name="l01095"></a>01095                 o2.is_cont,
 <a name="l01096"></a>01096                 CASE WHEN( o2.is_cont ) THEN
 <a name="l01097"></a>01097                     best_scv[7]
 <a name="l01098"></a>01098                 ELSE
 <a name="l01099"></a>01099                     NULL
 <a name="l01100"></a>01100                 END AS split_value,
 <a name="l01101"></a>01101                 o2.num_dist_value, best_scv[5]
 <a name="l01102"></a>01102         FROM
 <a name="l01103"></a>01103         (
 <a name="l01104"></a>01104             SELECT s1.tid, s1.nid,
 <a name="l01105"></a>01105                 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
 <a name="l01106"></a>01106                     coalesce(s1.split_value,0)) as best_scv
 <a name="l01107"></a>01107             FROM (
 <a name="l01108"></a>01108                 SELECT t1.tid, t1.nid, t1.fid, split_value,
 <a name="l01109"></a>01109                         MADLIB_SCHEMA.__scv_aggr
 <a name="l01110"></a>01110                             (%, is_cont, %, le, total, t2.node_size) AS scv
 <a name="l01111"></a>01111                 FROM
 <a name="l01112"></a>01112                     (
 <a name="l01113"></a>01113                         SELECT tid, nid, fid, fval, is_cont,
 <a name="l01114"></a>01114                         CASE WHEN (is_cont) THEN
 <a name="l01115"></a>01115                            fval
 <a name="l01116"></a>01116                         ELSE
 <a name="l01117"></a>01117                             NULL::FLOAT8
 <a name="l01118"></a>01118                         END AS split_value,
 <a name="l01119"></a>01119                         CASE WHEN (is_cont) THEN
 <a name="l01120"></a>01120                                 m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) OVER
 <a name="l01121"></a>01121                                     (PARTITION BY tid, nid, fid ORDER BY fval
 <a name="l01122"></a>01122                                      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
 <a name="l01123"></a>01123                         ELSE
 <a name="l01124"></a>01124                                 count
 <a name="l01125"></a>01125                         END AS le
 <a name="l01126"></a>01126                         FROM training_instance_aux
 <a name="l01127"></a>01127                     ) t1,
 <a name="l01128"></a>01128                     (
 <a name="l01129"></a>01129                         %
 <a name="l01130"></a>01130                     ) t2
 <a name="l01131"></a>01131                 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
 <a name="l01132"></a>01132                 GROUP BY t1.tid, t1.nid, t1.fid, split_value
 <a name="l01133"></a>01133             ) s1
 <a name="l01134"></a>01134             GROUP BY s1.tid, s1.nid
 <a name="l01135"></a>01135         ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.<span class="keywordtype">id</span>&#39;,
 <a name="l01136"></a>01136             ARRAY[
 <a name="l01137"></a>01137                 output_table,
 <a name="l01138"></a>01138                 continue_grow::TEXT,
 <a name="l01139"></a>01139                 confidence_level::TEXT,
 <a name="l01140"></a>01140                 split_criterion::TEXT,
 <a name="l01141"></a>01141                 num_classes::TEXT,
 <a name="l01142"></a>01142                 select_stmt,
 <a name="l01143"></a>01143                 feature_table_name
 <a name="l01144"></a>01144             ]
 <a name="l01145"></a>01145         ) INTO curstmt;
 <a name="l01146"></a>01146
 <a name="l01147"></a>01147     EXECUTE curstmt;
 <a name="l01148"></a>01148
 <a name="l01149"></a>01149     RETURN;
 <a name="l01150"></a>01150 END
 <a name="l01151"></a>01151 $$ LANGUAGE PLPGSQL;
 <a name="l01152"></a>01152
 <a name="l01153"></a>01153
 <a name="l01154"></a>01154 <span class="comment">/*</span>
 <a name="l01155"></a>01155 <span class="comment"> * @brief For training one decision tree, we need some internal tables</span>
 <a name="l01156"></a>01156 <span class="comment"> *        to store intermediate results. This function creates those</span>
 <a name="l01157"></a>01157 <span class="comment"> *        tables. Moreover, this function also creates the tree table</span>
 <a name="l01158"></a>01158 <span class="comment"> *        specified by user.</span>
 <a name="l01159"></a>01159 <span class="comment"> *</span>
 <a name="l01160"></a>01160 <span class="comment"> * @param result_tree_table_name  The name of the tree specified by user. </span>
 <a name="l01161"></a>01161 <span class="comment"> *                    </span>
 <a name="l01162"></a>01162 <span class="comment"> */</span>
 <a name="l01163"></a>01163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
 <a name="l01164"></a>01164     (
 <a name="l01165"></a>01165     result_tree_table_name TEXT
 <a name="l01166"></a>01166     )
 <a name="l01167"></a>01167 RETURNS <span class="keywordtype">void</span> AS $$
 <a name="l01168"></a>01168 BEGIN
 <a name="l01169"></a>01169     --  The table of node_size_aux records the size of each node. It is used
 <a name="l01170"></a>01170     --  for missing value handling.
 <a name="l01171"></a>01171     DROP TABLE IF EXISTS node_size_aux CASCADE;
 <a name="l01172"></a>01172     CREATE TEMP TABLE node_size_aux
 <a name="l01173"></a>01173     (
 <a name="l01174"></a>01174         tid             INT,
 <a name="l01175"></a>01175         nid             INT,
 <a name="l01176"></a>01176         node_size       INT
 <a name="l01177"></a>01177     )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (tid,nid)&#39;);
 <a name="l01178"></a>01178
 <a name="l01179"></a>01179     -- The table below stores the decision tree information just constructed.
 <a name="l01180"></a>01180     -- Columns:
 <a name="l01181"></a>01181     --      <span class="keywordtype">id</span>:             The ID of the node represented by this row. Tree
 <a name="l01182"></a>01182     --                      node IDs are unique across all trees. The IDs of
 <a name="l01183"></a>01183     --                      all children of a node are made to be continuous.
 <a name="l01184"></a>01184     --      tree_location:  An array containing the encoded values of all the
 <a name="l01185"></a>01185     --                      features on the path from the root node to the
 <a name="l01186"></a>01186     --                      current node. For the root node, the location
 <a name="l01187"></a>01187     --                      value is {0}.
 <a name="l01188"></a>01188     --      feature:        The ID of the best split feature chosen for the
 <a name="l01189"></a>01189     --                      node represented by this row.
 <a name="l01190"></a>01190     --      probability:    If forced to make a call for a dominant class
 <a name="l01191"></a>01191     --                      at a given point this would be the confidence of the
 <a name="l01192"></a>01192     --                      call (this is only an estimated value).
 <a name="l01193"></a>01193     --      ebp_coeff:      The total errors used by error based pruning (ebp)
 <a name="l01194"></a>01194     --                      based on the specified confidence level. RF does
 <a name="l01195"></a>01195     --                      not do EBP therefore for RF nodes, this column always
 <a name="l01196"></a>01196     --                      contains 1.
 <a name="l01197"></a>01197     --      max_class:      If forced to make a call for a dominant class
 <a name="l01198"></a>01198     --                      at a given point this is the selected class.
 <a name="l01199"></a>01199     --      scv:            The splitting criteria value (scv) computed at this node.
 <a name="l01200"></a>01200     --      live:           Specifies whether the node should be further split
 <a name="l01201"></a>01201     --                      or not. A positive value indicates further split of
 <a name="l01202"></a>01202     --                      the node represented by this row is needed.
 <a name="l01203"></a>01203     --      num_of_samples: The number of samples at this node.
 <a name="l01204"></a>01204     --      parent_id:      Id of the parent branch.
 <a name="l01205"></a>01205     --      lmc_nid:        Leftmost child (lmc) node id of the node represented
 <a name="l01206"></a>01206     --                      by the current row.
 <a name="l01207"></a>01207     --      lmc_fval:       The feature value which leads to the lmc node.
 <a name="l01208"></a>01208     --                      An example of getting all the child nodes&#39; ids
 <a name="l01209"></a>01209     --                      and condition values
 <a name="l01210"></a>01210     --                      1. Get the right most node id
 <a name="l01211"></a>01211     --                      SELECT DISTINCT ON(parent_id) id FROM tree_table
 <a name="l01212"></a>01212     --                      WHERE parent_id = $pid ORDER BY parent_id, id desc
 <a name="l01213"></a>01213     --                      INTO max_child_nid;
 <a name="l01214"></a>01214     --                      2. Get child nodes&#39; ids and condition values by a
 <a name="l01215"></a>01215     --                         while loop
 <a name="l01216"></a>01216     --                      node_count = 1;
 <a name="l01217"></a>01217     --                      WHILE (lmc_nid IS NOT NULL) AND
 <a name="l01218"></a>01218     --                          (0 &lt; node_count AND lmc_nid &lt;= max_child_nid) LOOP
 <a name="l01219"></a>01219     --                          ...
 <a name="l01220"></a>01220     --                          lmc_nid  = lmc_nid  + 1;
 <a name="l01221"></a>01221     --                          lmc_fval = lmc_fval + 1;
 <a name="l01222"></a>01222     --                          SELECT COUNT(id) FROM tree_table
 <a name="l01223"></a>01223     --                          WHERE id = $lmc_nid AND parent_id = $pid
 <a name="l01224"></a>01224     --                          INTO node_count;
 <a name="l01225"></a>01225     --                      END LOOP;
 <a name="l01226"></a>01226     --      is_cont:        It specifies whether the selected feature is a
 <a name="l01227"></a>01227     --                      continuous feature.
 <a name="l01228"></a>01228     --      split_value:    For continuous feature, it specifies the split value.
 <a name="l01229"></a>01229     --                      Otherwise, it is of no meaning and fixed to 0.
 <a name="l01230"></a>01230     --      tid:            The id of a tree that this node belongs to.
 <a name="l01231"></a>01231     --      dp_ids:         An array containing the IDs of the non-continuous
 <a name="l01232"></a>01232     --                      features chosen by all ancestor nodes (starting
 <a name="l01233"></a>01233     --                      from the root) for splitting.
 <a name="l01234"></a>01234     --
 <a name="l01235"></a>01235     -- The table below stores the final decision tree information.
 <a name="l01236"></a>01236     -- It is the table specified by users.
 <a name="l01237"></a>01237     -- Please refer to the column list above for detailed column definitions.
 <a name="l01238"></a>01238     EXECUTE &#39;DROP TABLE IF EXISTS &#39;||result_tree_table_name||&#39; CASCADE;&#39;;
 <a name="l01239"></a>01239     EXECUTE &#39;CREATE TABLE &#39;||result_tree_table_name||&#39;
 <a name="l01240"></a>01240     (
 <a name="l01241"></a>01241         id              INT,
 <a name="l01242"></a>01242         tree_location   INT[],
 <a name="l01243"></a>01243         feature         INT,
 <a name="l01244"></a>01244         probability     FLOAT,
 <a name="l01245"></a>01245         ebp_coeff       FLOAT,
 <a name="l01246"></a>01246         max_class       INTEGER,
 <a name="l01247"></a>01247         scv             FLOAT,
 <a name="l01248"></a>01248         live            INT,
 <a name="l01249"></a>01249         num_of_samples  INT,
 <a name="l01250"></a>01250         parent_id       INT,
 <a name="l01251"></a>01251         lmc_nid         INT,
 <a name="l01252"></a>01252         lmc_fval        INT,
 <a name="l01253"></a>01253         is_cont         BOOLEAN,
 <a name="l01254"></a>01254         split_value     FLOAT,
 <a name="l01255"></a>01255         tid             INT,
 <a name="l01256"></a>01256         dp_ids          INT[]
 <a name="l01257"></a>01257     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (tid,id)&#39;);&#39;;
 <a name="l01258"></a>01258
 <a name="l01259"></a>01259     -- The following table stores the auxiliary information for updating the
 <a name="l01260"></a>01260     -- association table, so that the updating operation only needs to
 <a name="l01261"></a>01261     -- join the encoded table with the association table once.
 <a name="l01262"></a>01262     EXECUTE &#39;DROP TABLE IF EXISTS assoc_aux CASCADE&#39;;
 <a name="l01263"></a>01263     CREATE TEMP TABLE assoc_aux
 <a name="l01264"></a>01264     (
 <a name="l01265"></a>01265         nid         INT,
 <a name="l01266"></a>01266         fid         INT,
 <a name="l01267"></a>01267         lmc_id      INT,
 <a name="l01268"></a>01268         svalue      FLOAT,
 <a name="l01269"></a>01269         is_cont     BOOL
 <a name="l01270"></a>01270     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (nid)&#39;);
 <a name="l01271"></a>01271
 <a name="l01272"></a>01272     EXECUTE &#39;DROP TABLE IF EXISTS tr_assoc_ping CASCADE&#39;;
 <a name="l01273"></a>01273     EXECUTE &#39;DROP TABLE IF EXISTS tr_assoc_pong CASCADE&#39;;
 <a name="l01274"></a>01274     EXECUTE &#39;DROP TABLE IF EXISTS sf_assoc CASCADE&#39;;
 <a name="l01275"></a>01275
 <a name="l01276"></a>01276 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
 <a name="l01277"></a>01277 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_GE_4_2_1__&lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01278"></a>01278     CREATE TEMP TABLE tr_assoc_ping
 <a name="l01279"></a>01279     (
 <a name="l01280"></a>01280         id      BIGINT ENCODING (compresstype=RLE_TYPE),
 <a name="l01281"></a>01281         nid     INT    ENCODING (compresstype=RLE_TYPE),
 <a name="l01282"></a>01282         tid     INT    ENCODING (compresstype=RLE_TYPE),
 <a name="l01283"></a>01283         weight  INT    ENCODING (compresstype=RLE_TYPE)
 <a name="l01284"></a>01284     )
 <a name="l01285"></a>01285     WITH(appendonly=true, orientation=column)
 <a name="l01286"></a>01286     DISTRIBUTED BY(id);
 <a name="l01287"></a>01287
 <a name="l01288"></a>01288     CREATE TEMP TABLE tr_assoc_pong
 <a name="l01289"></a>01289     (
 <a name="l01290"></a>01290         id      BIGINT ENCODING (compresstype=RLE_TYPE),
 <a name="l01291"></a>01291         nid     INT    ENCODING (compresstype=RLE_TYPE),
 <a name="l01292"></a>01292         tid     INT    ENCODING (compresstype=RLE_TYPE),
 <a name="l01293"></a>01293         weight  INT    ENCODING (compresstype=RLE_TYPE)
 <a name="l01294"></a>01294     )
 <a name="l01295"></a>01295     WITH(appendonly=true, orientation=column)
 <a name="l01296"></a>01296     DISTRIBUTED BY(id);
 <a name="l01297"></a>01297
 <a name="l01298"></a>01298     CREATE TEMP TABLE sf_assoc
 <a name="l01299"></a>01299     (
 <a name="l01300"></a>01300         nid     INT    ENCODING (compresstype=RLE_TYPE),
 <a name="l01301"></a>01301         fid     INT    ENCODING (compresstype=RLE_TYPE)
 <a name="l01302"></a>01302     )
 <a name="l01303"></a>01303     WITH(appendonly=true, orientation=column)
 <a name="l01304"></a>01304     DISTRIBUTED BY(fid);
 <a name="l01305"></a>01305 &lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01306"></a>01306     CREATE TEMP TABLE tr_assoc_ping
 <a name="l01307"></a>01307     (
 <a name="l01308"></a>01308         id      BIGINT,
 <a name="l01309"></a>01309         nid     INT,
 <a name="l01310"></a>01310         tid     INT,
 <a name="l01311"></a>01311         weight  INT
 <a name="l01312"></a>01312     )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l01313"></a>01313     CREATE TEMP TABLE tr_assoc_pong
 <a name="l01314"></a>01314     (
 <a name="l01315"></a>01315         id      BIGINT,
 <a name="l01316"></a>01316         nid     INT,
 <a name="l01317"></a>01317         tid     INT,
 <a name="l01318"></a>01318         weight  INT
 <a name="l01319"></a>01319     )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l01320"></a>01320     CREATE TEMP TABLE sf_assoc
 <a name="l01321"></a>01321     (
 <a name="l01322"></a>01322         nid     INT,
 <a name="l01323"></a>01323         fid     INT
 <a name="l01324"></a>01324     )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (fid)&#39;);
 <a name="l01325"></a>01325 &lt;&lt;&lt;)
 <a name="l01326"></a>01326 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
 <a name="l01327"></a>01327 END
 <a name="l01328"></a>01328 $$ LANGUAGE PLPGSQL;
 <a name="l01329"></a>01329
 <a name="l01330"></a>01330
 <a name="l01331"></a>01331 <span class="comment">/*</span>
 <a name="l01332"></a>01332 <span class="comment"> * @brief Prune the trained tree with the &quot;Reduced Error Pruning&quot; algorithm.</span>
 <a name="l01333"></a>01333 <span class="comment"> *</span>
 <a name="l01334"></a>01334 <span class="comment"> * @param tree_table_name   The name of the table containing the tree.</span>
 <a name="l01335"></a>01335 <span class="comment"> * @param validation_table  The name of the table containing the validation set.</span>
 <a name="l01336"></a>01336 <span class="comment"> * @param max_num_classes   The count of different classes.</span>
 <a name="l01337"></a>01337 <span class="comment"> * @note The tree table is modified in place; the encoded validation table is dropped at the end.</span>
 <a name="l01338"></a>01338 <span class="comment"> */</span>
 <a name="l01339"></a>01339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
 <a name="l01340"></a>01340     (
 <a name="l01341"></a>01341     tree_table_name     TEXT,
 <a name="l01342"></a>01342     validation_table    TEXT,
 <a name="l01343"></a>01343     max_num_classes     INT
 <a name="l01344"></a>01344     )
 <a name="l01345"></a>01345 RETURNS void AS $$
 <a name="l01346"></a>01346 DECLARE
 <a name="l01347"></a>01347     num_parent_ids          INTEGER;
 <a name="l01348"></a>01348     cf_table_name           TEXT;
 <a name="l01349"></a>01349     encoded_table_name      TEXT;
 <a name="l01350"></a>01350     metatable_name          TEXT;
 <a name="l01351"></a>01351     curstmt                 TEXT;
 <a name="l01352"></a>01352     id_col_name             TEXT;
 <a name="l01353"></a>01353     class_col_name          TEXT;
 <a name="l01354"></a>01354     classify_result         TEXT;
 <a name="l01355"></a>01355     temp_text               TEXT;
 <a name="l01356"></a>01356     n                       INT;
 <a name="l01357"></a>01357     table_names             TEXT[];
 <a name="l01358"></a>01358 BEGIN
 <a name="l01359"></a>01359     metatable_name  = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
 <a name="l01360"></a>01360     id_col_name     = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
 <a name="l01361"></a>01361     class_col_name  = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
 <a name="l01362"></a>01362
 <a name="l01363"></a>01363     -- every class value in the validation table must appear in the class (KV) table
 <a name="l01364"></a>01364     SELECT MADLIB_SCHEMA.__format
 <a name="l01365"></a>01365         (
 <a name="l01366"></a>01366         &#39;SELECT COUNT(*)
 <a name="l01367"></a>01367          FROM %
 <a name="l01368"></a>01368          WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
 <a name="l01369"></a>01369             (SELECT fval FROM % WHERE fval IS NOT NULL)&#39;,
 <a name="l01370"></a>01370         ARRAY[
 <a name="l01371"></a>01371             validation_table,
 <a name="l01372"></a>01372             class_col_name,
 <a name="l01373"></a>01373             MADLIB_SCHEMA.__get_classtable_name(metatable_name)
 <a name="l01374"></a>01374         ]
 <a name="l01375"></a>01375         )
 <a name="l01376"></a>01376     INTO curstmt;
 <a name="l01377"></a>01377
 <a name="l01378"></a>01378     EXECUTE curstmt INTO n;
 <a name="l01379"></a>01379
 <a name="l01380"></a>01380     PERFORM MADLIB_SCHEMA.__assert
 <a name="l01381"></a>01381             (
 <a name="l01382"></a>01382                 n = 0,
 <a name="l01383"></a>01383                 &#39;the value of class column in validation table must in
 <a name="l01384"></a>01384                  training table&#39;
 <a name="l01385"></a>01385             );
 <a name="l01386"></a>01386
 <a name="l01387"></a>01387     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
 <a name="l01388"></a>01388                   (
 <a name="l01389"></a>01389                     validation_table,
 <a name="l01390"></a>01390                     tree_table_name,
 <a name="l01391"></a>01391                     0
 <a name="l01392"></a>01392                   );
 <a name="l01393"></a>01393
 <a name="l01394"></a>01394     encoded_table_name = table_names[1];
 <a name="l01395"></a>01395     classify_result    = table_names[2];
 <a name="l01396"></a>01396     cf_table_name      = classify_result;
 <a name="l01397"></a>01397
 <a name="l01398"></a>01398     -- after encoding during classification, the class column is always named class
 <a name="l01399"></a>01399     class_col_name  = &#39;class&#39;;
 <a name="l01400"></a>01400
 <a name="l01401"></a>01401 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
 <a name="l01402"></a>01402 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_PRE_4_1__&lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01403"></a>01403     EXECUTE &#39;DROP TABLE IF EXISTS tree_rep_pong CASCADE&#39;;
 <a name="l01404"></a>01404     EXECUTE &#39;CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM &#39; ||
 <a name="l01405"></a>01405              classify_result ||
 <a name="l01406"></a>01406              &#39; LIMIT 0 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;)&#39;;
 <a name="l01407"></a>01407 &lt;&lt;&lt;)
 <a name="l01408"></a>01408 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
 <a name="l01409"></a>01409     -- Each pass prunes the selected parents; the loop exits when none is selected
 <a name="l01410"></a>01410     LOOP
 <a name="l01411"></a>01411         DROP TABLE IF EXISTS selected_parent_ids_rep;
 <a name="l01412"></a>01412         CREATE TEMP TABLE selected_parent_ids_rep
 <a name="l01413"></a>01413         (
 <a name="l01414"></a>01414             parent_id BIGINT,
 <a name="l01415"></a>01415             max_class  INT
 <a name="l01416"></a>01416         ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (parent_id)&#39;);
 <a name="l01417"></a>01417
 <a name="l01418"></a>01418         SELECT MADLIB_SCHEMA.__format
 <a name="l01419"></a>01419             (
 <a name="l01420"></a>01420                 &#39;INSERT INTO selected_parent_ids_rep
 <a name="l01421"></a>01421                 SELECT parent_id, t.g[1] as max_class
 <a name="l01422"></a>01422                 FROM
 <a name="l01423"></a>01423                 (
 <a name="l01424"></a>01424                     SELECT parent_id,
 <a name="l01425"></a>01425                            MADLIB_SCHEMA.__rep_aggr_class_count
 <a name="l01426"></a>01426                                (
 <a name="l01427"></a>01427                                c.class,
 <a name="l01428"></a>01428                                s.%,
 <a name="l01429"></a>01429                                %
 <a name="l01430"></a>01430                                ) AS g
 <a name="l01431"></a>01431                     FROM % c, % s
 <a name="l01432"></a>01432                     WHERE c.id=s.%
 <a name="l01433"></a>01433                     GROUP BY parent_id
 <a name="l01434"></a>01434                 ) t
 <a name="l01435"></a>01435                 WHERE t.g[2] &gt;= 0 AND
 <a name="l01436"></a>01436                       t.parent_id IN
 <a name="l01437"></a>01437                       (
 <a name="l01438"></a>01438                           Select parent_id FROM %
 <a name="l01439"></a>01439                           WHERE parent_id NOT IN
 <a name="l01440"></a>01440                               (
 <a name="l01441"></a>01441                                   Select parent_id
 <a name="l01442"></a>01442                                   FROM %
 <a name="l01443"></a>01443                                   WHERE lmc_nid IS NOT NULL
 <a name="l01444"></a>01444                               ) and id &lt;&gt; 1
 <a name="l01445"></a>01445                       );&#39;,
 <a name="l01446"></a>01446                   ARRAY[
 <a name="l01447"></a>01447                       class_col_name,
 <a name="l01448"></a>01448                       MADLIB_SCHEMA.__to_char(max_num_classes),
 <a name="l01449"></a>01449                       classify_result,
 <a name="l01450"></a>01450                       encoded_table_name,
 <a name="l01451"></a>01451                       id_col_name,
 <a name="l01452"></a>01452                       tree_table_name,
 <a name="l01453"></a>01453                       tree_table_name
 <a name="l01454"></a>01454                   ]
 <a name="l01455"></a>01455               )
 <a name="l01456"></a>01456               INTO curstmt;
 <a name="l01457"></a>01457
 <a name="l01458"></a>01458         EXECUTE curstmt;
 <a name="l01459"></a>01459
 <a name="l01460"></a>01460         EXECUTE &#39;SELECT parent_id FROM selected_parent_ids_rep limit 1;&#39;
 <a name="l01461"></a>01461             INTO num_parent_ids;
 <a name="l01462"></a>01462         IF (num_parent_ids IS NULL)  THEN
 <a name="l01463"></a>01463             EXIT;
 <a name="l01464"></a>01464         END IF;
 <a name="l01465"></a>01465
 <a name="l01466"></a>01466 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
 <a name="l01467"></a>01467 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_PRE_4_1__&lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01468"></a>01468         -- UPDATE cannot redistribute data across segments on some databases, so the
 <a name="l01469"></a>01469         -- rows go into the other table. NOTE(review): only re-labelled rows are copied; confirm.
 <a name="l01470"></a>01470         IF (classify_result = &#39;tree_rep_pong&#39;) THEN
 <a name="l01471"></a>01471             temp_text = cf_table_name;
 <a name="l01472"></a>01472         ELSE
 <a name="l01473"></a>01473             temp_text =  &#39;tree_rep_pong&#39;;
 <a name="l01474"></a>01474         END IF;
 <a name="l01475"></a>01475
 <a name="l01476"></a>01476         EXECUTE &#39;TRUNCATE &#39; ||  temp_text;
 <a name="l01477"></a>01477         SELECT MADLIB_SCHEMA.__format
 <a name="l01478"></a>01478             (
 <a name="l01479"></a>01479             &#39;INSERT INTO %(id, class, parent_id, leaf_id)
 <a name="l01480"></a>01480              SELECT m.id,  t.max_class, t.parent_id, t.id
 <a name="l01481"></a>01481              FROM % m, % t
 <a name="l01482"></a>01482              WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
 <a name="l01483"></a>01483              m.parent_id = t.id&#39;,
 <a name="l01484"></a>01484             ARRAY[
 <a name="l01485"></a>01485                 temp_text,
 <a name="l01486"></a>01486                 classify_result,
 <a name="l01487"></a>01487                 tree_table_name
 <a name="l01488"></a>01488             ]
 <a name="l01489"></a>01489             )
 <a name="l01490"></a>01490         INTO curstmt;
 <a name="l01491"></a>01491
 <a name="l01492"></a>01492         EXECUTE curstmt;
 <a name="l01493"></a>01493
 <a name="l01494"></a>01494         classify_result = temp_text;
 <a name="l01495"></a>01495 &lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01496"></a>01496         SELECT MADLIB_SCHEMA.__format
 <a name="l01497"></a>01497             (
 <a name="l01498"></a>01498                 &#39;UPDATE % m set class = t.max_class,
 <a name="l01499"></a>01499                  parent_id = t.parent_id,leaf_id = t.id
 <a name="l01500"></a>01500                  FROM % t
 <a name="l01501"></a>01501                  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
 <a name="l01502"></a>01502                  m.parent_id=t.id&#39;,
 <a name="l01503"></a>01503                 classify_result,
 <a name="l01504"></a>01504                 tree_table_name
 <a name="l01505"></a>01505             )
 <a name="l01506"></a>01506         INTO curstmt;
 <a name="l01507"></a>01507         EXECUTE curstmt;
 <a name="l01508"></a>01508 &lt;&lt;&lt;)
 <a name="l01509"></a>01509 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
 <a name="l01510"></a>01510         -- Remove the children of the selected parents from the tree
 <a name="l01511"></a>01511         SELECT MADLIB_SCHEMA.__format
 <a name="l01512"></a>01512             (
 <a name="l01513"></a>01513                 &#39;DELETE FROM % WHERE parent_id IN
 <a name="l01514"></a>01514                  (SELECT parent_id FROM selected_parent_ids_rep)&#39;,
 <a name="l01515"></a>01515                 tree_table_name
 <a name="l01516"></a>01516             )
 <a name="l01517"></a>01517             INTO curstmt;
 <a name="l01518"></a>01518
 <a name="l01519"></a>01519         EXECUTE curstmt;
 <a name="l01520"></a>01520         -- Clear the child links of the selected parents and store their new max_class
 <a name="l01521"></a>01521         SELECT MADLIB_SCHEMA.__format
 <a name="l01522"></a>01522             (
 <a name="l01523"></a>01523                 &#39;UPDATE % t1 SET lmc_nid = NULL,
 <a name="l01524"></a>01524                  lmc_fval = NULL, max_class = t2.max_class
 <a name="l01525"></a>01525                  FROM selected_parent_ids_rep t2
 <a name="l01526"></a>01526                  WHERE t1.id = t2.parent_id;&#39;,
 <a name="l01527"></a>01527                 tree_table_name
 <a name="l01528"></a>01528             )
 <a name="l01529"></a>01529             INTO curstmt;
 <a name="l01530"></a>01530
 <a name="l01531"></a>01531         EXECUTE curstmt;
 <a name="l01532"></a>01532
 <a name="l01533"></a>01533     END LOOP;
 <a name="l01534"></a>01534
 <a name="l01535"></a>01535     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE;&#39;;
 <a name="l01536"></a>01536 END
 <a name="l01537"></a>01537 $$ LANGUAGE PLPGSQL;
 <a name="l01538"></a>01538
 <a name="l01539"></a>01539
 <a name="l01540"></a>01540 <span class="comment">/*</span>
 <a name="l01541"></a>01541 <span class="comment"> * @brief Calculates the total errors used by Error Based Pruning (EBP).</span>
 <a name="l01542"></a>01542 <span class="comment"> *</span>
 <a name="l01543"></a>01543 <span class="comment"> * @param total             The total number of samples represented by the node</span>
 <a name="l01544"></a>01544 <span class="comment"> *                          being processed.</span>
 <a name="l01545"></a>01545 <span class="comment"> * @param prob              The probability of misclassifying the samples represented by</span>
 <a name="l01546"></a>01546 <span class="comment"> *                          the child nodes if they are pruned with EBP.</span>
 <a name="l01547"></a>01547 <span class="comment"> * @param confidence_level  A certainty factor used to calculate the confidence limits</span>
 <a name="l01548"></a>01548 <span class="comment"> *                          for the probability of error using the binomial theorem.</span>
 <a name="l01549"></a>01549 <span class="comment"> *</span>
 <a name="l01550"></a>01550 <span class="comment"> * @return The computed total error.</span>
 <a name="l01551"></a>01551 <span class="comment"> * @note Bound to the C symbol dt_ebp_calc_errors; declared STRICT and IMMUTABLE.</span>
 <a name="l01552"></a>01552 <span class="comment"> */</span>
 <a name="l01553"></a>01553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
 <a name="l01554"></a>01554     (
 <a name="l01555"></a>01555     total               FLOAT8,
 <a name="l01556"></a>01556     prob                FLOAT8,
 <a name="l01557"></a>01557     confidence_level    FLOAT8
 <a name="l01558"></a>01558     ) RETURNS FLOAT8
 <a name="l01559"></a>01559 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_ebp_calc_errors&#39;
 <a name="l01560"></a>01560 LANGUAGE C STRICT IMMUTABLE;
 <a name="l01561"></a>01561
 <a name="l01562"></a>01562
 <a name="l01563"></a>01563 <span class="comment">/*</span>
 <a name="l01564"></a>01564 <span class="comment"> * @brief Prune the trained tree with the &quot;Error-based Pruning&quot; algorithm.</span>
 <a name="l01565"></a>01565 <span class="comment"> *</span>
 <a name="l01566"></a>01566 <span class="comment"> * @param tree_table_name  The name of the table containing the tree.</span>
 <a name="l01567"></a>01567 <span class="comment"> * @note The tree table is modified in place; the loop repeats until no parent is selected.</span>
 <a name="l01568"></a>01568 <span class="comment"> */</span>
 <a name="l01569"></a>01569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
 <a name="l01570"></a>01570     (
 <a name="l01571"></a>01571     tree_table_name TEXT
 <a name="l01572"></a>01572     )
 <a name="l01573"></a>01573 RETURNS void AS $$
 <a name="l01574"></a>01574 DECLARE
 <a name="l01575"></a>01575     num_parent_ids INTEGER;
 <a name="l01576"></a>01576     curstmt TEXT;
 <a name="l01577"></a>01577 BEGIN
 <a name="l01578"></a>01578     LOOP
 <a name="l01579"></a>01579         DROP TABLE IF EXISTS selected_parent_ids_ebp;
 <a name="l01580"></a>01580         CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
 <a name="l01581"></a>01581             m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY(parent_id)&#39;);
 <a name="l01582"></a>01582         -- Select parents with no child that has lmc_nid set and whose ebp_coeff is below the summed ebp_coeff of the children
 <a name="l01583"></a>01583         SELECT MADLIB_SCHEMA.__format
 <a name="l01584"></a>01584             (
 <a name="l01585"></a>01585                 &#39;INSERT INTO selected_parent_ids_ebp
 <a name="l01586"></a>01586                 SELECT s.parent_id as parent_id
 <a name="l01587"></a>01587                 FROM
 <a name="l01588"></a>01588                 (
 <a name="l01589"></a>01589                     Select parent_id, sum(ebp_coeff) as ebp_coeff
 <a name="l01590"></a>01590                     FROM
 <a name="l01591"></a>01591                     (
 <a name="l01592"></a>01592                         Select parent_id, ebp_coeff
 <a name="l01593"></a>01593                         FROM %
 <a name="l01594"></a>01594                         WHERE parent_id NOT IN
 <a name="l01595"></a>01595                             (
 <a name="l01596"></a>01596                             Select parent_id  FROM % WHERE lmc_nid IS NOT NULL
 <a name="l01597"></a>01597                             )  and id &lt;&gt; 1
 <a name="l01598"></a>01598                     ) m
 <a name="l01599"></a>01599                     GROUP BY m.parent_id
 <a name="l01600"></a>01600                  ) s
 <a name="l01601"></a>01601                  LEFT JOIN  % p
 <a name="l01602"></a>01602                     ON p.id = s.parent_id
 <a name="l01603"></a>01603                  WHERE  p.ebp_coeff &lt; s.ebp_coeff;&#39;,
 <a name="l01604"></a>01604                  tree_table_name,
 <a name="l01605"></a>01605                  tree_table_name,
 <a name="l01606"></a>01606                  tree_table_name
 <a name="l01607"></a>01607             )
 <a name="l01608"></a>01608             INTO curstmt;
 <a name="l01609"></a>01609
 <a name="l01610"></a>01610         EXECUTE curstmt;
 <a name="l01611"></a>01611         -- Fetch one selected parent id; NULL means nothing is left to prune
 <a name="l01612"></a>01612         EXECUTE &#39;SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;&#39;
 <a name="l01613"></a>01613                  INTO num_parent_ids;
 <a name="l01614"></a>01614
 <a name="l01615"></a>01615         IF (num_parent_ids IS NULL)  THEN
 <a name="l01616"></a>01616             EXIT;
 <a name="l01617"></a>01617         END IF;
 <a name="l01618"></a>01618         -- Remove the children of the selected parents
 <a name="l01619"></a>01619         SELECT MADLIB_SCHEMA.__format
 <a name="l01620"></a>01620             (
 <a name="l01621"></a>01621                 &#39;DELETE FROM %
 <a name="l01622"></a>01622                 WHERE parent_id IN
 <a name="l01623"></a>01623                     (SELECT parent_id FROM selected_parent_ids_ebp)&#39;,
 <a name="l01624"></a>01624                 tree_table_name
 <a name="l01625"></a>01625             )
 <a name="l01626"></a>01626             INTO curstmt;
 <a name="l01627"></a>01627
 <a name="l01628"></a>01628         EXECUTE curstmt;
 <a name="l01629"></a>01629         -- Clear lmc_nid and lmc_fval on the selected parents
 <a name="l01630"></a>01630         SELECT MADLIB_SCHEMA.__format
 <a name="l01631"></a>01631             (
 <a name="l01632"></a>01632                 &#39;UPDATE %
 <a name="l01633"></a>01633                 SET lmc_nid = NULL, lmc_fval = NULL
 <a name="l01634"></a>01634                 WHERE id IN
 <a name="l01635"></a>01635                     (SELECT parent_id FROM selected_parent_ids_ebp)&#39;,
 <a name="l01636"></a>01636                 tree_table_name
 <a name="l01637"></a>01637             )
 <a name="l01638"></a>01638             INTO curstmt;
 <a name="l01639"></a>01639
 <a name="l01640"></a>01640         EXECUTE curstmt;
 <a name="l01641"></a>01641
 <a name="l01642"></a>01642     END LOOP;
 <a name="l01643"></a>01643 END
 <a name="l01644"></a>01644 $$ LANGUAGE PLPGSQL;
 <a name="l01645"></a>01645
 <a name="l01646"></a>01646
 <a name="l01647"></a>01647 <span class="comment">/*</span>
 <a name="l01648"></a>01648 <span class="comment"> * @brief Generate the final trained tree.</span>
 <a name="l01649"></a>01649 <span class="comment"> *</span>
 <a name="l01650"></a>01650 <span class="comment"> * @param result_tree_table_name  The name of the table containing the tree.</span>
 <a name="l01651"></a>01651 <span class="comment"> * @note The tree table is modified in place: rows are deleted and lmc_nid/lmc_fval are rewritten.</span>
 <a name="l01652"></a>01652 <span class="comment"> */</span>
 <a name="l01653"></a>01653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
 <a name="l01654"></a>01654     (
 <a name="l01655"></a>01655     result_tree_table_name TEXT
 <a name="l01656"></a>01656     )
 <a name="l01657"></a>01657 RETURNS void AS $$
 <a name="l01658"></a>01658 DECLARE
 <a name="l01659"></a>01659     tree_size           INTEGER;
 <a name="l01660"></a>01660     curstmt             TEXT;
 <a name="l01661"></a>01661     num_redundant_nodes INTEGER;
 <a name="l01662"></a>01662 BEGIN
 <a name="l01663"></a>01663     -- Remove nodes that hold no samples (num_of_samples is NULL or 0)
 <a name="l01664"></a>01664     EXECUTE &#39; DELETE FROM &#39; || result_tree_table_name ||
 <a name="l01665"></a>01665             &#39; WHERE COALESCE(num_of_samples,0) = 0&#39;;
 <a name="l01666"></a>01666
 <a name="l01667"></a>01667     -- For each node, take the minimum child id and the minimum last element of the
 <a name="l01668"></a>01668     -- children tree_location arrays, and store them in lmc_nid and lmc_fval
 <a name="l01669"></a>01669     SELECT MADLIB_SCHEMA.__format
 <a name="l01670"></a>01670             (
 <a name="l01671"></a>01671                 &#39;UPDATE % k
 <a name="l01672"></a>01672                  SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
 <a name="l01673"></a>01673                  FROM
 <a name="l01674"></a>01674                     (
 <a name="l01675"></a>01675                     SELECT parent_id,
 <a name="l01676"></a>01676                            min(id) as lmc_nid,
 <a name="l01677"></a>01677                            min(tree_location[array_upper(tree_location,1)])
 <a name="l01678"></a>01678                            as lmc_fval
 <a name="l01679"></a>01679                     FROM %
 <a name="l01680"></a>01680                     GROUP BY parent_id
 <a name="l01681"></a>01681                     ) g
 <a name="l01682"></a>01682                 WHERE k.id = g.parent_id&#39;,
 <a name="l01683"></a>01683                 ARRAY[
 <a name="l01684"></a>01684                     result_tree_table_name,
 <a name="l01685"></a>01685                     result_tree_table_name
 <a name="l01686"></a>01686                     ]
 <a name="l01687"></a>01687             )
 <a name="l01688"></a>01688     INTO curstmt;
 <a name="l01689"></a>01689     EXECUTE curstmt;
 <a name="l01690"></a>01690
 <a name="l01691"></a>01691     <span class="comment">/*</span>
 <a name="l01692"></a>01692 <span class="comment">     *  If all children of a node are leaf nodes and share the same class</span>
 <a name="l01693"></a>01693 <span class="comment">     *  label, the children can safely be removed. After removal, the same</span>
 <a name="l01694"></a>01694 <span class="comment">     *  operation is applied to the new leaf nodes until no node meets</span>
 <a name="l01695"></a>01695 <span class="comment">     *  this criterion.</span>
 <a name="l01696"></a>01696 <span class="comment">     */</span>
 <a name="l01697"></a>01697     LOOP
 <a name="l01698"></a>01698         EXECUTE &#39;DROP TABLE IF EXISTS trim_tree_aux_table CASCADE&#39;;
 <a name="l01699"></a>01699         -- Find the nodes whose children should be removed.
 <a name="l01700"></a>01700         curstmt = MADLIB_SCHEMA.__format
 <a name="l01701"></a>01701             (
 <a name="l01702"></a>01702             &#39;CREATE TEMP TABLE trim_tree_aux_table AS
 <a name="l01703"></a>01703             SELECT parent_id FROM
 <a name="l01704"></a>01704             (
 <a name="l01705"></a>01705                 SELECT parent_id, count(distinct max_class) as class_count
 <a name="l01706"></a>01706                 FROM %
 <a name="l01707"></a>01707                 WHERE parent_id IN
 <a name="l01708"></a>01708                     (
 <a name="l01709"></a>01709                     SELECT parent_id FROM %
 <a name="l01710"></a>01710                     WHERE parent_id NOT IN
 <a name="l01711"></a>01711                         (
 <a name="l01712"></a>01712                             SELECT parent_id
 <a name="l01713"></a>01713                             FROM %
 <a name="l01714"></a>01714                             WHERE lmc_nid IS NOT NULL
 <a name="l01715"></a>01715                         ) and parent_id &lt;&gt; 0
 <a name="l01716"></a>01716                     )
 <a name="l01717"></a>01717                 GROUP BY parent_id
 <a name="l01718"></a>01718             ) l
 <a name="l01719"></a>01719             where l.class_count=1
 <a name="l01720"></a>01720             m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (parent_id)&#39;)&#39;,
 <a name="l01721"></a>01721             ARRAY[
 <a name="l01722"></a>01722                 result_tree_table_name,
 <a name="l01723"></a>01723                 result_tree_table_name,
 <a name="l01724"></a>01724                 result_tree_table_name
 <a name="l01725"></a>01725                 ]
 <a name="l01726"></a>01726             );
 <a name="l01727"></a>01727         EXECUTE curstmt;
 <a name="l01728"></a>01728         -- Count the selected parents; the loop exits when there are none
 <a name="l01729"></a>01729         EXECUTE &#39;SELECT count(*) FROM trim_tree_aux_table&#39;
 <a name="l01730"></a>01730             INTO num_redundant_nodes;
 <a name="l01731"></a>01731
 <a name="l01732"></a>01732         IF (num_redundant_nodes &lt;= 0) THEN
 <a name="l01733"></a>01733             EXIT;
 <a name="l01734"></a>01734         END IF;
 <a name="l01735"></a>01735
 <a name="l01736"></a>01736         -- Delete the redundant child nodes that were found.
 <a name="l01737"></a>01737         curstmt = MADLIB_SCHEMA.__format
 <a name="l01738"></a>01738             (
 <a name="l01739"></a>01739             &#39;
 <a name="l01740"></a>01740             DELETE FROM % t
 <a name="l01741"></a>01741             WHERE t.parent_id IN
 <a name="l01742"></a>01742             (SELECT parent_id FROM trim_tree_aux_table)&#39;,
 <a name="l01743"></a>01743             ARRAY[
 <a name="l01744"></a>01744                 result_tree_table_name
 <a name="l01745"></a>01745                 ]
 <a name="l01746"></a>01746             );
 <a name="l01747"></a>01747         EXECUTE curstmt;
 <a name="l01748"></a>01748
 <a name="l01749"></a>01749         -- Turn the nodes whose children were removed into leaf nodes.
 <a name="l01750"></a>01750         curstmt =  MADLIB_SCHEMA.__format
 <a name="l01751"></a>01751                 (
 <a name="l01752"></a>01752                 &#39;UPDATE % k
 <a name="l01753"></a>01753                  SET lmc_nid = NULL, lmc_fval = NULL
 <a name="l01754"></a>01754                  FROM
 <a name="l01755"></a>01755                     (
 <a name="l01756"></a>01756                     SELECT parent_id FROM trim_tree_aux_table
 <a name="l01757"></a>01757                     ) g
 <a name="l01758"></a>01758                  WHERE k.id = g.parent_id&#39;,
 <a name="l01759"></a>01759                 ARRAY[
 <a name="l01760"></a>01760                     result_tree_table_name
 <a name="l01761"></a>01761                     ]
 <a name="l01762"></a>01762                 );
 <a name="l01763"></a>01763         EXECUTE curstmt;
 <a name="l01764"></a>01764     END LOOP;
 <a name="l01765"></a>01765 END
 <a name="l01766"></a>01766 $$ LANGUAGE PLPGSQL;
 <a name="l01767"></a>01767
 <a name="l01768"></a>01768
 <a name="l01769"></a>01769 <span class="comment">/*</span>
 <a name="l01770"></a>01770 <span class="comment"> * The UDT for the training result.</span>
 <a name="l01771"></a>01771 <span class="comment"> *</span>
 <a name="l01772"></a>01772 <span class="comment"> *      num_of_samples           The number of records in the</span>
 <a name="l01773"></a>01773 <span class="comment"> *                               training set.</span>
 <a name="l01774"></a>01774 <span class="comment"> *      features_per_node        The number of features chosen for each tree.</span>
 <a name="l01775"></a>01775 <span class="comment"> *      num_tree_nodes           The number of tree nodes.</span>
 <a name="l01776"></a>01776 <span class="comment"> *      max_tree_depth           The max tree depth.</span>
 <a name="l01777"></a>01777 <span class="comment"> *      calc_acc_time            Total time of calculating acc.</span>
 <a name="l01778"></a>01778 <span class="comment"> *      calc_pre_time            Time of preprocessing when calculating acc.</span>
 <a name="l01779"></a>01779 <span class="comment"> *      update_time              Total time of updating operation after found</span>
 <a name="l01780"></a>01780 <span class="comment"> *                               the best time. </span>
 <a name="l01781"></a>01781 <span class="comment"> *      update_best              Time of updating the best splits&#39; information.</span>
 <a name="l01782"></a>01782 <span class="comment"> *      update_child             Time of generating the child nodes.</span>
 <a name="l01783"></a>01783 <span class="comment"> *      update_nid               Time of updating the assigned node IDs.</span>
 <a name="l01784"></a>01784 <span class="comment"> *      scv_acs_time             Time of calculating the best splits.     </span>
 <a name="l01785"></a>01785 <span class="comment"> *      prune_time               Time of tree pruning.</span>
 <a name="l01786"></a>01786 <span class="comment"> *</span>
 <a name="l01787"></a>01787 <span class="comment"> */</span>
 <a name="l01788"></a>01788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
 <a name="l01789"></a>01789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
 <a name="l01790"></a>01790 (
 <a name="l01791"></a>01791     num_of_samples           BIGINT,
 <a name="l01792"></a>01792     features_per_node        INT,
 <a name="l01793"></a>01793     num_tree_nodes           INT,
 <a name="l01794"></a>01794     max_tree_depth           INT,
 <a name="l01795"></a>01795     calc_acc_time            INTERVAL,
 <a name="l01796"></a>01796     calc_pre_time            INTERVAL,
 <a name="l01797"></a>01797     update_time              INTERVAL,
 <a name="l01798"></a>01798     update_best              INTERVAL,
 <a name="l01799"></a>01799     update_child             INTERVAL,
 <a name="l01800"></a>01800     update_nid               INTERVAL,
 <a name="l01801"></a>01801     scv_acs_time             INTERVAL,
 <a name="l01802"></a>01802     prune_time               INTERVAL
 <a name="l01803"></a>01803 );
 <a name="l01804"></a>01804
 <a name="l01805"></a>01805
 <a name="l01806"></a>01806 <span class="comment">/*</span>
 <a name="l01807"></a>01807 <span class="comment"> * @brief Samples a set of integer values between low and high.</span>
 <a name="l01808"></a>01808 <span class="comment"> *</span>
 <a name="l01809"></a>01809 <span class="comment"> * @param num_of_samples  The number of values to be sampled.</span>
 <a name="l01810"></a>01810 <span class="comment"> * @param low             The lower limit of the sampled values.</span>
 <a name="l01811"></a>01811 <span class="comment"> * @param high            The upper limit of the sampled values.</span>
 <a name="l01812"></a>01812 <span class="comment"> *</span>
 <a name="l01813"></a>01813 <span class="comment"> * @return A set of integer values sampled randomly from [low, high].</span>
 <a name="l01814"></a>01814 <span class="comment"> * @note   Implemented in C as dt_sample_within_range.</span>
 <a name="l01815"></a>01815 <span class="comment"> */</span>
 <a name="l01816"></a>01816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
 <a name="l01817"></a>01817     (
 <a name="l01818"></a>01818     BIGINT,
 <a name="l01819"></a>01819     BIGINT,
 <a name="l01820"></a>01820     BIGINT
 <a name="l01821"></a>01821     )CASCADE;
 <a name="l01822"></a>01822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
 <a name="l01823"></a>01823     (
 <a name="l01824"></a>01824     num_of_samples      BIGINT,
 <a name="l01825"></a>01825     low                 BIGINT,
 <a name="l01826"></a>01826     high                BIGINT
 <a name="l01827"></a>01827     )
 <a name="l01828"></a>01828 RETURNS SETOF BIGINT
 <a name="l01829"></a>01829 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_sample_within_range&#39;
 <a name="l01830"></a>01830 LANGUAGE C STRICT VOLATILE;
 <a name="l01831"></a>01831
 <a name="l01832"></a>01832
 <a name="l01833"></a>01833 <span class="comment">/*</span>
 <a name="l01834"></a>01834 <span class="comment"> * @brief Samples with replacement from the source table and stores the</span>
 <a name="l01835"></a>01835 <span class="comment"> *        results in the target table.</span>
 <a name="l01836"></a>01836 <span class="comment"> *</span>
 <a name="l01837"></a>01837 <span class="comment"> *        We first calculate how many samples each segment should generate.</span>
 <a name="l01838"></a>01838 <span class="comment"> *        The segments then sample IDs with replacement, in parallel, between</span>
 <a name="l01839"></a>01839 <span class="comment"> *        the minimum and maximum ID of the source table and assign the</span>
 <a name="l01840"></a>01840 <span class="comment"> *        samples to different trees.</span>
 <a name="l01841"></a>01841 <span class="comment"> *</span>
 <a name="l01842"></a>01842 <span class="comment"> *        If there are gaps in the ID column of the source table, we sample</span>
 <a name="l01843"></a>01843 <span class="comment"> *        extra records in proportion to the number of gaps. Finally, we remove</span>
 <a name="l01844"></a>01844 <span class="comment"> *        the invalid samples by an inner join with the source table. Since we</span>
 <a name="l01845"></a>01845 <span class="comment"> *        target big data, this strategy works quite well.</span>
 <a name="l01846"></a>01846 <span class="comment"> *</span>
 <a name="l01847"></a>01847 <span class="comment"> * @param num_of_tree     The number of trees to be trained.</span>
 <a name="l01848"></a>01848 <span class="comment"> * @param size_per_tree   The number of records to be sampled for each tree.</span>
 <a name="l01849"></a>01849 <span class="comment"> * @param src_table       The table to sample from; it must have an id column.</span>
 <a name="l01850"></a>01850 <span class="comment"> * @param target_table    The table receiving rows of (id, tid, nid, weight).</span>
 <a name="l01851"></a>01851 <span class="comment"> *</span>
 <a name="l01852"></a>01852 <span class="comment"> */</span>
 <a name="l01853"></a>01853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
 <a name="l01854"></a>01854     (
 <a name="l01855"></a>01855     num_of_tree     INT,
 <a name="l01856"></a>01856     size_per_tree   INT,
 <a name="l01857"></a>01857     src_table       TEXT,
 <a name="l01858"></a>01858     target_table    TEXT
 <a name="l01859"></a>01859     )
 <a name="l01860"></a>01860 RETURNS VOID AS $$
 <a name="l01861"></a>01861 DECLARE
 <a name="l01862"></a>01862     segment_num     INT;
 <a name="l01863"></a>01863     sample_per_seg  INT;
 <a name="l01864"></a>01864     sample_ratio    FLOAT8;
 <a name="l01865"></a>01865     record_num      FLOAT8;
 <a name="l01866"></a>01866     min_id          INT;
 <a name="l01867"></a>01867     max_id          INT;
 <a name="l01868"></a>01868     range           FLOAT8;
 <a name="l01869"></a>01869     stmt            TEXT;
 <a name="l01870"></a>01870 BEGIN
 <a name="l01871"></a>01871
 <a name="l01872"></a>01872 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
 <a name="l01873"></a>01873 m4_ifdef(&gt;&gt;&gt;__GREENPLUM__&lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01874"></a>01874     -- get the number of segments
 <a name="l01875"></a>01875     SELECT COUNT(distinct content) FROM gp_segment_configuration
 <a name="l01876"></a>01876         WHERE content&lt;&gt;-1 INTO segment_num;
 <a name="l01877"></a>01877 &lt;&lt;&lt;, &gt;&gt;&gt;
 <a name="l01878"></a>01878     -- fix the number of segments to 1 for PostgreSQL
 <a name="l01879"></a>01879     segment_num = 1;
 <a name="l01880"></a>01880 &lt;&lt;&lt;)
 <a name="l01881"></a>01881 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
 <a name="l01882"></a>01882
 <a name="l01883"></a>01883
 <a name="l01884"></a>01884     DROP TABLE IF EXISTS auxiliary_segment_table;
 <a name="l01885"></a>01885     CREATE TEMP TABLE auxiliary_segment_table
 <a name="l01886"></a>01886     (
 <a name="l01887"></a>01887         segment_id  INT
 <a name="l01888"></a>01888     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY(segment_id)&#39;);
 <a name="l01889"></a>01889
 <a name="l01890"></a>01890     -- Insert segment_num records, one per segment id (1..segment_num)
 <a name="l01891"></a>01891     EXECUTE &#39;INSERT INTO auxiliary_segment_table
 <a name="l01892"></a>01892         SELECT generate_series(1,&#39;||segment_num||&#39;);&#39;;
 <a name="l01893"></a>01893
 <a name="l01894"></a>01894     EXECUTE &#39;SELECT max(id),min(id), count(id) as record_num
 <a name="l01895"></a>01895         FROM &#39;||src_table||&#39;;&#39; INTO max_id,min_id,record_num;
 <a name="l01896"></a>01896     range=max_id-min_id+1;
 <a name="l01897"></a>01897
 <a name="l01898"></a>01898     -- compute the sample ratio: width of the ID range over the record count
 <a name="l01899"></a>01899     sample_ratio= range/record_num;
 <a name="l01900"></a>01900
 <a name="l01901"></a>01901     -- compute how many records should be sampled by each segment
 <a name="l01902"></a>01902     sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
 <a name="l01903"></a>01903
 <a name="l01904"></a>01904     -- build the INSERT; weight counts how often a record is drawn per tree
 <a name="l01905"></a>01905
 <a name="l01906"></a>01906     IF (range &gt; record_num) THEN
 <a name="l01907"></a>01907         -- ID gaps exist: drop invalid sampled IDs by joining the source table
 <a name="l01908"></a>01908         stmt = MADLIB_SCHEMA.__format
 <a name="l01909"></a>01909             (
 <a name="l01910"></a>01910             &#39;INSERT INTO %(id, tid, nid, weight)
 <a name="l01911"></a>01911               SELECT record_id,
 <a name="l01912"></a>01912                      tid AS tid,
 <a name="l01913"></a>01913                      tid AS nid,
 <a name="l01914"></a>01914                      count(*) AS weight
 <a name="l01915"></a>01915               FROM
 <a name="l01916"></a>01916                 (
 <a name="l01917"></a>01917                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
 <a name="l01918"></a>01918                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
 <a name="l01919"></a>01919                     FROM auxiliary_segment_table
 <a name="l01920"></a>01920                 ) t,
 <a name="l01921"></a>01921                 % k
 <a name="l01922"></a>01922               WHERE t.record_id=k.id
 <a name="l01923"></a>01923               GROUP BY record_id, tid, nid&#39;,
 <a name="l01924"></a>01924             ARRAY[
 <a name="l01925"></a>01925                 target_table,
 <a name="l01926"></a>01926                 sample_per_seg::TEXT,
 <a name="l01927"></a>01927                 min_id::TEXT,
 <a name="l01928"></a>01928                 max_id::TEXT,
 <a name="l01929"></a>01929                 sample_per_seg::TEXT,
 <a name="l01930"></a>01930                 num_of_tree::TEXT,
 <a name="l01931"></a>01931                 src_table
 <a name="l01932"></a>01932             ]
 <a name="l01933"></a>01933             );
 <a name="l01934"></a>01934     ELSE
 <a name="l01935"></a>01935         stmt = MADLIB_SCHEMA.__format
 <a name="l01936"></a>01936             (
 <a name="l01937"></a>01937             &#39;INSERT INTO %(id, tid, nid, weight)
 <a name="l01938"></a>01938               SELECT record_id,
 <a name="l01939"></a>01939                      tid AS tid,
 <a name="l01940"></a>01940                      tid AS nid,
 <a name="l01941"></a>01941                      count(*) AS weight
 <a name="l01942"></a>01942               FROM
 <a name="l01943"></a>01943                 (
 <a name="l01944"></a>01944                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
 <a name="l01945"></a>01945                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
 <a name="l01946"></a>01946                     FROM auxiliary_segment_table
 <a name="l01947"></a>01947                 ) t
 <a name="l01948"></a>01948               GROUP BY record_id, tid, nid&#39;,
 <a name="l01949"></a>01949             ARRAY[
 <a name="l01950"></a>01950                 target_table,
 <a name="l01951"></a>01951                 sample_per_seg::TEXT,
 <a name="l01952"></a>01952                 min_id::TEXT,
 <a name="l01953"></a>01953                 max_id::TEXT,
 <a name="l01954"></a>01954                 sample_per_seg::TEXT,
 <a name="l01955"></a>01955                 num_of_tree::TEXT
 <a name="l01956"></a>01956             ]
 <a name="l01957"></a>01957             );
 <a name="l01958"></a>01958     END IF;
 <a name="l01959"></a>01959
 <a name="l01960"></a>01960     EXECUTE stmt;
 <a name="l01961"></a>01961 END
 <a name="l01962"></a>01962 $$ LANGUAGE PLPGSQL VOLATILE;
 <a name="l01963"></a>01963
 <a name="l01964"></a>01964
 <a name="l01965"></a>01965 <span class="comment">/*</span>
 <a name="l01966"></a>01966 <span class="comment"> * @brief This function trains a decision tree or random forest.</span>
 <a name="l01967"></a>01967 <span class="comment"> *</span>
 <a name="l01968"></a>01968 <span class="comment"> * @param split_criterion             This parameter specifies which split criterion </span>
 <a name="l01969"></a>01969 <span class="comment"> *                                    should be used for tree construction and </span>
 <a name="l01970"></a>01970 <span class="comment"> *                                    pruning. The valid values are infogain, </span>
 <a name="l01971"></a>01971 <span class="comment"> *                                    gainratio, and gini.</span>
 <a name="l01972"></a>01972 <span class="comment"> * @param num_trees                   Total number of trees to be trained. </span>
 <a name="l01973"></a>01973 <span class="comment"> * @param features_per_node           Total number of features used to compute split </span>
 <a name="l01974"></a>01974 <span class="comment"> *                                    gain for each node. </span>
 <a name="l01975"></a>01975 <span class="comment"> * @param training_table_name         The name of the table/view with the source data. </span>
 <a name="l01976"></a>01976 <span class="comment"> * @param training_table_meta         The name of the table with the meta data. </span>
 <a name="l01977"></a>01977 <span class="comment"> * @param result_tree_table_name      The name of the table where the resulting </span>
 <a name="l01978"></a>01978 <span class="comment"> *                                    DT/RF will be stored. </span>
 <a name="l01979"></a>01979 <span class="comment"> * @param validation_table_name       The validation table used for pruning tree.  </span>
 <a name="l01980"></a>01980 <span class="comment"> * @param id_col_name                 The name of the column containing id of each point.  </span>
 <a name="l01981"></a>01981 <span class="comment"> * @param class_col_name              The name of the column containing correct class </span>
 <a name="l01982"></a>01982 <span class="comment"> *                                    of each point.  </span>
 <a name="l01983"></a>01983 <span class="comment"> * @param confidence_level            A statistical confidence interval of the </span>
 <a name="l01984"></a>01984 <span class="comment"> *                                    resubstitution error.  </span>
 <a name="l01985"></a>01985 <span class="comment"> * @param max_tree_depth              Maximum decision tree depth.  </span>
 <a name="l01986"></a>01986 <span class="comment"> * @param node_prune_threshold        Specifies the minimum number of samples required </span>
 <a name="l01987"></a>01987 <span class="comment"> *                                    in a child node.  </span>
 <a name="l01988"></a>01988 <span class="comment"> * @param node_split_threshold        Specifies the minimum number of samples required </span>
 <a name="l01989"></a>01989 <span class="comment"> *                                    in a node in order for a further split   </span>
 <a name="l01990"></a>01990 <span class="comment"> *                                    to be possible.  </span>
 <a name="l01991"></a>01991 <span class="comment"> * @param sampling_needed             Whether sampling is enabled.</span>
 <a name="l01992"></a>01992 <span class="comment"> * @param h2hmv_routine_id            Specifies how to handle missing values. </span>
 <a name="l01993"></a>01993 <span class="comment"> *                                    1 ignore, 2 explicit.</span>
 <a name="l01994"></a>01994 <span class="comment"> * @param verbosity                   &gt; 0 means this function runs in verbose mode. </span>
 <a name="l01995"></a>01995 <span class="comment"> *  </span>
 <a name="l01996"></a>01996 <span class="comment"> * @return A record containing training-related information.</span>
 <a name="l01997"></a>01997 <span class="comment"> *         For details, refer to the UDT: MADLIB_SCHEMA.__train_result.</span>
 <a name="l01998"></a>01998 <span class="comment"> *</span>
 <a name="l01999"></a>01999 <span class="comment"> */</span>
 <a name="l02000"></a>02000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
 <a name="l02001"></a>02001     (
 <a name="l02002"></a>02002     split_criterion         TEXT,
 <a name="l02003"></a>02003     num_trees               INT,
 <a name="l02004"></a>02004     features_per_node       INT,
 <a name="l02005"></a>02005     training_table_name     TEXT,
 <a name="l02006"></a>02006     training_table_meta     TEXT,
 <a name="l02007"></a>02007     result_tree_table_name  TEXT,
 <a name="l02008"></a>02008     validation_table_name   TEXT,
 <a name="l02009"></a>02009     id_col_name             TEXT,
 <a name="l02010"></a>02010     class_col_name          TEXT,
 <a name="l02011"></a>02011     confidence_level        FLOAT,
 <a name="l02012"></a>02012     max_tree_depth          INT,
 <a name="l02013"></a>02013     sampling_percentage     FLOAT,
 <a name="l02014"></a>02014     node_prune_threshold    FLOAT,
 <a name="l02015"></a>02015     node_split_threshold    FLOAT,
 <a name="l02016"></a>02016     sampling_needed         BOOLEAN,
 <a name="l02017"></a>02017     h2hmv_routine_id        INT,
 <a name="l02018"></a>02018     verbosity               INT
 <a name="l02019"></a>02019     )
 <a name="l02020"></a>02020 RETURNS MADLIB_SCHEMA.__train_result AS $$
 <a name="l02021"></a>02021 DECLARE
 <a name="l02022"></a>02022     num_live_nodes              INT;
 <a name="l02023"></a>02023     max_nid                     INT;
 <a name="l02024"></a>02024     location                    INT[];
 <a name="l02025"></a>02025     temp_location               INT[];
 <a name="l02026"></a>02026     num_classes                 INT;
 <a name="l02027"></a>02027     answer                      record;
 <a name="l02028"></a>02028     location_size               INT;
 <a name="l02029"></a>02029     begin_func_exec             TIMESTAMP;
 <a name="l02030"></a>02030     begin_find_best             TIMESTAMP;
 <a name="l02031"></a>02031     scv_acs_time                INTERVAL;
 <a name="l02032"></a>02032     begin_data_transfer         TIMESTAMP;
 <a name="l02033"></a>02033     begin_update_best           TIMESTAMP;
 <a name="l02034"></a>02034     begin_update_child          TIMESTAMP;
 <a name="l02035"></a>02035     begin_update_nid            TIMESTAMP;
 <a name="l02036"></a>02036     calc_update_best            INTERVAL;
 <a name="l02037"></a>02037     calc_update_child           INTERVAL;
 <a name="l02038"></a>02038     calc_update_nid             INTERVAL;
 <a name="l02039"></a>02039     ins_upd_time                INTERVAL;
 <a name="l02040"></a>02040     begin_olap_acs              TIMESTAMP;
 <a name="l02041"></a>02041     calc_acc_time               INTERVAL;
 <a name="l02042"></a>02042     calc_pre_time               INTERVAL;
 <a name="l02043"></a>02043     calc_olap_time              INTERVAL;
 <a name="l02044"></a>02044     begin_bld_assoc             TIMESTAMP;
 <a name="l02045"></a>02045     bld_assoc_time              INTERVAL;
 <a name="l02046"></a>02046     begin_prune                 TIMESTAMP;
 <a name="l02047"></a>02047     prune_time                  INTERVAL;
 <a name="l02048"></a>02048     total_size                  FLOAT;
 <a name="l02049"></a>02049     sc_code                     INT := 1;
 <a name="l02050"></a>02050     curstmt                     TEXT := &#39;&#39;;
 <a name="l02051"></a>02051     grow_tree                   INT := max_tree_depth;
 <a name="l02052"></a>02052     ret                         MADLIB_SCHEMA.__train_result;
 <a name="l02053"></a>02053     curr_level                  INT := 1;
 <a name="l02054"></a>02054     dp_ids                      INT[];
 <a name="l02055"></a>02055     dp_ids_text                 TEXT;
 <a name="l02056"></a>02056     instance_time               MADLIB_SCHEMA.__gen_acc_time;
 <a name="l02057"></a>02057     tr_table_index              INT := 1;
 <a name="l02058"></a>02058     tr_tables                   TEXT[] := &#39;{tr_assoc_ping, tr_assoc_pong}&#39;;
 <a name="l02059"></a>02059     cur_tr_table                TEXT := &#39;tr_assoc_ping&#39;;
 <a name="l02060"></a>02060     need_analyze                BOOL := &#39;t&#39;::BOOL;
 <a name="l02061"></a>02061     attr_count                  INT;
 <a name="l02062"></a>02062 BEGIN
 <a name="l02063"></a>02063     -- record the time spent in the different steps of training
 <a name="l02064"></a>02064     begin_func_exec     = clock_timestamp();
 <a name="l02065"></a>02065     scv_acs_time        = begin_func_exec - begin_func_exec;
 <a name="l02066"></a>02066     calc_olap_time      = scv_acs_time;
 <a name="l02067"></a>02067     calc_acc_time       = scv_acs_time;
 <a name="l02068"></a>02068     calc_pre_time       = scv_acs_time;
 <a name="l02069"></a>02069     ins_upd_time        = scv_acs_time;
 <a name="l02070"></a>02070     calc_update_best    = scv_acs_time;
 <a name="l02071"></a>02071     calc_update_child   = scv_acs_time;
 <a name="l02072"></a>02072     calc_update_nid     = scv_acs_time;
 <a name="l02073"></a>02073     bld_assoc_time      = scv_acs_time;
 <a name="l02074"></a>02074     prune_time          = scv_acs_time;
 <a name="l02075"></a>02075
 <a name="l02076"></a>02076     IF(split_criterion = &#39;infogain&#39;) THEN
 <a name="l02077"></a>02077         sc_code = 1;
 <a name="l02078"></a>02078     ELSIF (split_criterion = &#39;gainratio&#39;) THEN
 <a name="l02079"></a>02079         sc_code = 2;
 <a name="l02080"></a>02080     ELSIF (split_criterion = &#39;gini&#39;) THEN
 <a name="l02081"></a>02081         sc_code = 3;
 <a name="l02082"></a>02082     ELSE
 <a name="l02083"></a>02083         RAISE EXCEPTION &#39;%&#39;, &#39;Invalid split criterion!&#39;;
 <a name="l02084"></a>02084     END IF;
 <a name="l02085"></a>02085
 <a name="l02086"></a>02086     num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
 <a name="l02087"></a>02087
 <a name="l02088"></a>02088     IF(verbosity &gt; 0) THEN
 <a name="l02089"></a>02089         RAISE INFO &#39;NUMBER OF CLASSES IN THE TRAINING SET %&#39;, num_classes;
 <a name="l02090"></a>02090     END IF;
 <a name="l02091"></a>02091
 <a name="l02092"></a>02092     IF(num_classes &lt; 2) THEN
 <a name="l02093"></a>02093         RAISE EXCEPTION &#39;the number of classes must be greater than 2&#39;;
 <a name="l02094"></a>02094     END IF;
 <a name="l02095"></a>02095
 <a name="l02096"></a>02096     curstmt = MADLIB_SCHEMA.__format
 <a name="l02097"></a>02097         (
 <a name="l02098"></a>02098             &#39;SELECT
 <a name="l02099"></a>02099                 count(*)
 <a name="l02100"></a>02100              FROM %
 <a name="l02101"></a>02101              WHERE column_type=&#39;&#39;f&#39;&#39;&#39;,
 <a name="l02102"></a>02102             training_table_meta
 <a name="l02103"></a>02103         );
 <a name="l02104"></a>02104     EXECUTE curstmt INTO attr_count;
 <a name="l02105"></a>02105
 <a name="l02106"></a>02106     -- generate the horizontal table for updating assigned node IDs
 <a name="l02107"></a>02107     PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
 <a name="l02108"></a>02108         (
 <a name="l02109"></a>02109             &#39;tmp_dt_hori_table&#39;,
 <a name="l02110"></a>02110             training_table_name,
 <a name="l02111"></a>02111             attr_count,
 <a name="l02112"></a>02112             verbosity
 <a name="l02113"></a>02113         );
 <a name="l02114"></a>02114
 <a name="l02115"></a>02115     EXECUTE &#39;SELECT count(*) FROM tmp_dt_hori_table&#39; INTO total_size;
 <a name="l02116"></a>02116
 <a name="l02117"></a>02117     IF(verbosity &gt; 0) THEN
 <a name="l02118"></a>02118         RAISE INFO &#39;INPUT TABLE SIZE: %&#39;, total_size;
 <a name="l02119"></a>02119     END IF;
 <a name="l02120"></a>02120
 <a name="l02121"></a>02121     begin_bld_assoc = clock_timestamp();
 <a name="l02122"></a>02122     cur_tr_table = tr_tables[tr_table_index];
 <a name="l02123"></a>02123
 <a name="l02124"></a>02124     -- The table of tr_assoc holds the information of which records are
 <a name="l02125"></a>02125     -- used during training for each tree.
 <a name="l02126"></a>02126     -- It has four columns.
 <a name="l02127"></a>02127     --     id     --   The id of one record.
 <a name="l02128"></a>02128     --     tid    --   The id of a tree.
 <a name="l02129"></a>02129     --     nid    --   The id of a node in a tree.
 <a name="l02130"></a>02130     --     weight --   The times a record is assigned to a node.
 <a name="l02131"></a>02131     IF (sampling_needed) THEN
 <a name="l02132"></a>02132         PERFORM MADLIB_SCHEMA.__sample_with_replacement
 <a name="l02133"></a>02133             (
 <a name="l02134"></a>02134             num_trees,
 <a name="l02135"></a>02135             round(sampling_percentage * total_size)::INT,
 <a name="l02136"></a>02136             &#39;tmp_dt_hori_table&#39;,
 <a name="l02137"></a>02137             cur_tr_table
 <a name="l02138"></a>02138             );
 <a name="l02139"></a>02139     ELSE
 <a name="l02140"></a>02140         curstmt = MADLIB_SCHEMA.__format
 <a name="l02141"></a>02141              (
 <a name="l02142"></a>02142                 &#39;INSERT INTO %
 <a name="l02143"></a>02143                  SELECT id, 1 as tid, 1 as nid, 1 as weight
 <a name="l02144"></a>02144                  FROM %&#39;,
 <a name="l02145"></a>02145                  ARRAY[
 <a name="l02146"></a>02146                     cur_tr_table,
 <a name="l02147"></a>02147                     &#39;tmp_dt_hori_table&#39;
 <a name="l02148"></a>02148                 ]
 <a name="l02149"></a>02149              );
 <a name="l02150"></a>02150         EXECUTE curstmt;
 <a name="l02151"></a>02151     END IF;
 <a name="l02152"></a>02152
 <a name="l02153"></a>02153     -- analyze ping
 <a name="l02154"></a>02154     EXECUTE &#39;ANALYZE &#39; || cur_tr_table;
 <a name="l02155"></a>02155     bld_assoc_time = clock_timestamp() - begin_bld_assoc;
 <a name="l02156"></a>02156
 <a name="l02157"></a>02157     -- generate the root node for all trees.
 <a name="l02158"></a>02158     -- the generated numbers are the same for the two generate_series
 <a name="l02159"></a>02159     SELECT MADLIB_SCHEMA.__format
 <a name="l02160"></a>02160         (
 <a name="l02161"></a>02161             &#39;INSERT INTO %
 <a name="l02162"></a>02162                 (id, tree_location, feature, probability, max_class,scv,
 <a name="l02163"></a>02163                  live, num_of_samples, parent_id, tid)
 <a name="l02164"></a>02164             SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
 <a name="l02165"></a>02165                    generate_series(1, %)&#39;,
 <a name="l02166"></a>02166             ARRAY[
 <a name="l02167"></a>02167                 result_tree_table_name,
 <a name="l02168"></a>02168                 num_trees::TEXT,
 <a name="l02169"></a>02169                 num_trees::TEXT
 <a name="l02170"></a>02170             ]
 <a name="l02171"></a>02171         ) INTO curstmt;
 <a name="l02172"></a>02172
 <a name="l02173"></a>02173     EXECUTE curstmt;
 <a name="l02174"></a>02174
 <a name="l02175"></a>02175     max_nid         = num_trees;
 <a name="l02176"></a>02176     location_size   = 0;
 <a name="l02177"></a>02177
 <a name="l02178"></a>02178
 <a name="l02179"></a>02179     LOOP
 <a name="l02180"></a>02180         EXECUTE &#39;SELECT COUNT(id) FROM &#39; || result_tree_table_name ||
 <a name="l02181"></a>02181             &#39; WHERE live &gt; 0 AND array_upper(tree_location,1)=&#39;||
 <a name="l02182"></a>02182             curr_level||&#39;;&#39; INTO num_live_nodes;
 <a name="l02183"></a>02183
 <a name="l02184"></a>02184         IF (num_live_nodes &lt; 1) THEN
 <a name="l02185"></a>02185             IF(verbosity &gt; 0) THEN
 <a name="l02186"></a>02186                 RAISE INFO &#39;EXIT: %&#39;, &#39;no live nodes to split&#39;;
 <a name="l02187"></a>02187             END IF;
 <a name="l02188"></a>02188
 <a name="l02189"></a>02189             EXIT;
 <a name="l02190"></a>02190         END IF;
 <a name="l02191"></a>02191
 <a name="l02192"></a>02192         IF (verbosity &gt; 0) THEN
 <a name="l02193"></a>02193             RAISE INFO &#39;Running on level:%&#39;, curr_level;
 <a name="l02194"></a>02194         END IF;
 <a name="l02195"></a>02195
 <a name="l02196"></a>02196         begin_olap_acs = clock_timestamp();
 <a name="l02197"></a>02197
 <a name="l02198"></a>02198         instance_time = MADLIB_SCHEMA.__gen_acc
 <a name="l02199"></a>02199             (
 <a name="l02200"></a>02200             training_table_name,
 <a name="l02201"></a>02201             training_table_meta,
 <a name="l02202"></a>02202             result_tree_table_name,
 <a name="l02203"></a>02203             cur_tr_table,
 <a name="l02204"></a>02204             &#39;sf_assoc&#39;,
 <a name="l02205"></a>02205             features_per_node,
 <a name="l02206"></a>02206             num_classes,
 <a name="l02207"></a>02207             sampling_needed,
 <a name="l02208"></a>02208             verbosity
 <a name="l02209"></a>02209             );
 <a name="l02210"></a>02210
 <a name="l02211"></a>02211         IF (h2hmv_routine_id=1) THEN
 <a name="l02212"></a>02212             -- For ignore, we need the true size of nodes to handle the
 <a name="l02213"></a>02213             -- missing values.
 <a name="l02214"></a>02214             TRUNCATE node_size_aux;
 <a name="l02215"></a>02215
 <a name="l02216"></a>02216             curstmt = MADLIB_SCHEMA.__format
 <a name="l02217"></a>02217                 (
 <a name="l02218"></a>02218                     &#39;INSERT INTO node_size_aux
 <a name="l02219"></a>02219                      SELECT tr.tid, tr.nid, sum(weight) as count
 <a name="l02220"></a>02220                      FROM % tr
 <a name="l02221"></a>02221                      GROUP BY tr.tid, tr.nid&#39;,
 <a name="l02222"></a>02222                     cur_tr_table
 <a name="l02223"></a>02223                 );
 <a name="l02224"></a>02224
 <a name="l02225"></a>02225             EXECUTE curstmt;
 <a name="l02226"></a>02226         END IF;
 <a name="l02227"></a>02227
 <a name="l02228"></a>02228         calc_pre_time  = calc_pre_time + instance_time.calc_pre_time;
 <a name="l02229"></a>02229         calc_acc_time  = calc_acc_time + instance_time.calc_acc_time;
 <a name="l02230"></a>02230         calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
 <a name="l02231"></a>02231
 <a name="l02232"></a>02232         curr_level = curr_level + 1;
 <a name="l02233"></a>02233
 <a name="l02234"></a>02234         begin_find_best = clock_timestamp();
 <a name="l02235"></a>02235
 <a name="l02236"></a>02236         PERFORM MADLIB_SCHEMA.__find_best_split
 <a name="l02237"></a>02237                (
 <a name="l02238"></a>02238                &#39;training_instance&#39;,
 <a name="l02239"></a>02239                confidence_level,
 <a name="l02240"></a>02240                training_table_meta,
 <a name="l02241"></a>02241                sc_code,
 <a name="l02242"></a>02242                grow_tree,
 <a name="l02243"></a>02243                &#39;find_best_answer_table&#39;,
 <a name="l02244"></a>02244                h2hmv_routine_id,
 <a name="l02245"></a>02245                num_classes
 <a name="l02246"></a>02246                );
 <a name="l02247"></a>02247         IF (verbosity &gt; 0) THEN
 <a name="l02248"></a>02248             RAISE INFO &#39;find best time at this level:%&#39;,
 <a name="l02249"></a>02249                 clock_timestamp() - begin_find_best;
 <a name="l02250"></a>02250         END IF;
 <a name="l02251"></a>02251         grow_tree = grow_tree - 1;
 <a name="l02252"></a>02252
 <a name="l02253"></a>02253         scv_acs_time        = scv_acs_time +
 <a name="l02254"></a>02254                               (clock_timestamp() - begin_find_best);
 <a name="l02255"></a>02255         begin_data_transfer = clock_timestamp();
 <a name="l02256"></a>02256         begin_update_best   = clock_timestamp();
 <a name="l02257"></a>02257
 <a name="l02258"></a>02258         -- We get the calculation result for current level.
 <a name="l02259"></a>02259         -- Update the nodes of previous level firstly.
 <a name="l02260"></a>02260         SELECT MADLIB_SCHEMA.__format
 <a name="l02261"></a>02261             (
 <a name="l02262"></a>02262                 &#39;UPDATE % t
 <a name="l02263"></a>02263                  SET feature        = c.feature,
 <a name="l02264"></a>02264                      probability    = c.probability,
 <a name="l02265"></a>02265                      max_class      = c.max_class,
 <a name="l02266"></a>02266                      scv            = c.max_scv,
 <a name="l02267"></a>02267                      ebp_coeff      = c.ebp_coeff,
 <a name="l02268"></a>02268                      num_of_samples = c.node_size,
 <a name="l02269"></a>02269                      live           = 0,
 <a name="l02270"></a>02270                      is_cont        = c.is_cont,
 <a name="l02271"></a>02271                      split_value    = c.split_value
 <a name="l02272"></a>02272                  FROM find_best_answer_table c
 <a name="l02273"></a>02273                  WHERE t.id=c.node_id AND t.tid=c.tid&#39;,
 <a name="l02274"></a>02274                  ARRAY[
 <a name="l02275"></a>02275                     result_tree_table_name::TEXT
 <a name="l02276"></a>02276                  ]
 <a name="l02277"></a>02277              ) INTO curstmt;
 <a name="l02278"></a>02278
 <a name="l02279"></a>02279         EXECUTE curstmt;
 <a name="l02280"></a>02280
 <a name="l02281"></a>02281         calc_update_best    = calc_update_best +
 <a name="l02282"></a>02282                               (clock_timestamp() - begin_update_best);
 <a name="l02283"></a>02283         begin_update_child  = clock_timestamp();
 <a name="l02284"></a>02284
 <a name="l02285"></a>02285         curstmt=
 <a name="l02286"></a>02286             MADLIB_SCHEMA.__format(
 <a name="l02287"></a>02287                    &#39;INSERT INTO %(id, tree_location, feature, probability,
 <a name="l02288"></a>02288                         max_class, scv, live, parent_id, tid, dp_ids)
 <a name="l02289"></a>02289                         SELECT %+row, array_append(tree_location, fval),
 <a name="l02290"></a>02290                             0, 1, 1, 1, %, ans.node_id, ans.tid,
 <a name="l02291"></a>02291                             CASE when(NOT ans.is_cont) then
 <a name="l02292"></a>02292                                 array_append( dp_ids, ans.feature)
 <a name="l02293"></a>02293                             ELSE
 <a name="l02294"></a>02294                                 dp_ids
 <a name="l02295"></a>02295                             END
 <a name="l02296"></a>02296                         FROM % tree,
 <a name="l02297"></a>02297                         (
 <a name="l02298"></a>02298                             SELECT  *,
 <a name="l02299"></a>02299                                     row_number()
 <a name="l02300"></a>02300                                     OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
 <a name="l02301"></a>02301                             FROM
 <a name="l02302"></a>02302                             (
 <a name="l02303"></a>02303                                 SELECT  *,
 <a name="l02304"></a>02304                                         CASE WHEN (is_cont) THEN
 <a name="l02305"></a>02305                                             generate_series(1,2)
 <a name="l02306"></a>02306                                         ELSE
 <a name="l02307"></a>02307                                             generate_series(1, distinct_features)
 <a name="l02308"></a>02308                                         END AS fval
 <a name="l02309"></a>02309                                 FROM
 <a name="l02310"></a>02310                                 find_best_answer_table
 <a name="l02311"></a>02311                                 WHERE live&gt;0 AND coalesce(feature, 0) &lt;&gt; 0
 <a name="l02312"></a>02312                                       AND node_size &gt;= % AND node_size &gt;= %
 <a name="l02313"></a>02313                             ) l
 <a name="l02314"></a>02314                         ) ans
 <a name="l02315"></a>02315                         WHERE tree.id=ans.node_id and tree.tid=ans.tid;&#39;,
 <a name="l02316"></a>02316                         ARRAY[
 <a name="l02317"></a>02317                             result_tree_table_name,
 <a name="l02318"></a>02318                             (max_nid)::TEXT,
 <a name="l02319"></a>02319                             curr_level::TEXT,
 <a name="l02320"></a>02320                             result_tree_table_name,
 <a name="l02321"></a>02321                             (total_size * node_prune_threshold)::TEXT,
 <a name="l02322"></a>02322                             (total_size * node_split_threshold)::TEXT
 <a name="l02323"></a>02323                         ]
 <a name="l02324"></a>02324                         );
 <a name="l02325"></a>02325         IF(verbosity &gt; 0) THEN
 <a name="l02326"></a>02326             RAISE INFO &#39;Generate Child Nodes:%&#39;, curstmt;
 <a name="l02327"></a>02327         END IF;
 <a name="l02328"></a>02328
 <a name="l02329"></a>02329         EXECUTE curstmt;
 <a name="l02330"></a>02330
 <a name="l02331"></a>02331         EXECUTE &#39;SELECT max(id) FROM &#39;||result_tree_table_name INTO max_nid;
 <a name="l02332"></a>02332
 <a name="l02333"></a>02333         IF(verbosity &gt; 0) THEN
 <a name="l02334"></a>02334             RAISE INFO &#39;Max nid:%, level:%&#39;, max_nid, curr_level;
 <a name="l02335"></a>02335         END IF;
 <a name="l02336"></a>02336
 <a name="l02337"></a>02337         -- insert the leftmost child node id and relevant info
 <a name="l02338"></a>02338         -- to the assoc_aux table, so that we will make use of this
 <a name="l02339"></a>02339         -- info to  update the assigned nid the samples belong to
 <a name="l02340"></a>02340         -- the current node whose id is answer.node_id.
 <a name="l02341"></a>02341         SELECT MADLIB_SCHEMA.__format
 <a name="l02342"></a>02342             (
 <a name="l02343"></a>02343                 &#39;INSERT INTO assoc_aux
 <a name="l02344"></a>02344                  (nid, fid, lmc_id, svalue, is_cont)
 <a name="l02345"></a>02345                     SELECT t.id, t.feature, min(l.id),
 <a name="l02346"></a>02346                             t.split_value, t.is_cont
 <a name="l02347"></a>02347                     FROM
 <a name="l02348"></a>02348                         (SELECT id, parent_id
 <a name="l02349"></a>02349                         FROM %
 <a name="l02350"></a>02350                         WHERE array_upper(tree_location,1)=%) l,
 <a name="l02351"></a>02351                         % t
 <a name="l02352"></a>02352                     WHERE l.parent_id=t.id
 <a name="l02353"></a>02353                     GROUP BY t.id, t.feature, t.split_value, t.is_cont;&#39;,
 <a name="l02354"></a>02354                 ARRAY[
 <a name="l02355"></a>02355                     result_tree_table_name,
 <a name="l02356"></a>02356                     curr_level::TEXT,
 <a name="l02357"></a>02357                     result_tree_table_name
 <a name="l02358"></a>02358                 ]
 <a name="l02359"></a>02359             ) INTO curstmt;
 <a name="l02360"></a>02360
 <a name="l02361"></a>02361         IF(verbosity &gt; 0) THEN
 <a name="l02362"></a>02362             RAISE INFO &#39;Update lmc_child Info:%&#39;, curstmt;
 <a name="l02363"></a>02363         END IF;
 <a name="l02364"></a>02364
 <a name="l02365"></a>02365         EXECUTE curstmt;
 <a name="l02366"></a>02366
 <a name="l02367"></a>02367         -- delete the unused nodes on the previous level
 <a name="l02368"></a>02368         -- delete those nodes with a size less than node_prune_threshold
 <a name="l02369"></a>02369         -- node_prune_threshold will not apply to root node,
 <a name="l02370"></a>02370         -- the level is 1 (curr_level - 1 = 1);
 <a name="l02371"></a>02371         IF (curr_level &gt; 2) THEN
 <a name="l02372"></a>02372             curstmt = MADLIB_SCHEMA.__format
 <a name="l02373"></a>02373                         (
 <a name="l02374"></a>02374                             &#39;DELETE FROM % t
 <a name="l02375"></a>02375                              WHERE t.num_of_samples &lt; % OR live = %;&#39;,
 <a name="l02376"></a>02376                             ARRAY[
 <a name="l02377"></a>02377                                 result_tree_table_name::TEXT,
 <a name="l02378"></a>02378                                 (total_size * node_prune_threshold)::TEXT,
 <a name="l02379"></a>02379                                 (curr_level - 1)::TEXT
 <a name="l02380"></a>02380                             ]
 <a name="l02381"></a>02381                         );
 <a name="l02382"></a>02382             EXECUTE curstmt;
 <a name="l02383"></a>02383         END IF;
 <a name="l02384"></a>02384
 <a name="l02385"></a>02385         calc_update_child   = calc_update_child + (clock_timestamp() - begin_update_child);
 <a name="l02386"></a>02386         begin_update_nid    = clock_timestamp();
 <a name="l02387"></a>02387
 <a name="l02388"></a>02388         -- update the assigned node id for each sample on the current level
 <a name="l02389"></a>02389         tr_table_index = (tr_table_index % 2) + 1;
 <a name="l02390"></a>02390         curstmt = MADLIB_SCHEMA.__format
 <a name="l02391"></a>02391             (
 <a name="l02392"></a>02392                 &#39;INSERT INTO % (id, nid, tid, weight)
 <a name="l02393"></a>02393                  SELECT
 <a name="l02394"></a>02394                     tr.id,
 <a name="l02395"></a>02395                     au.lmc_id - 1 +
 <a name="l02396"></a>02396                     CASE WHEN (au.is_cont) THEN
 <a name="l02397"></a>02397                             CASE WHEN (svalue &lt; vt.fvals[au.fid]) THEN
 <a name="l02398"></a>02398                                 2
 <a name="l02399"></a>02399                             ELSE
 <a name="l02400"></a>02400                                 1
 <a name="l02401"></a>02401                             END
 <a name="l02402"></a>02402                     ELSE
 <a name="l02403"></a>02403                         vt.fvals[au.fid]::INT
 <a name="l02404"></a>02404                     END AS nid,
 <a name="l02405"></a>02405                     tid, weight
 <a name="l02406"></a>02406                   FROM % tr, % vt, assoc_aux au
 <a name="l02407"></a>02407                   WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL&#39;,
 <a name="l02408"></a>02408                 ARRAY[
 <a name="l02409"></a>02409                     tr_tables[tr_table_index],
 <a name="l02410"></a>02410                     cur_tr_table,
 <a name="l02411"></a>02411                     &#39;tmp_dt_hori_table&#39;
 <a name="l02412"></a>02412                 ]
 <a name="l02413"></a>02413             );
 <a name="l02414"></a>02414         IF (verbosity &gt; 0) THEN
 <a name="l02415"></a>02415             RAISE INFO &#39;%&#39;, curstmt;
 <a name="l02416"></a>02416         END IF;
 <a name="l02417"></a>02417
 <a name="l02418"></a>02418         EXECUTE curstmt;
 <a name="l02419"></a>02419         EXECUTE &#39;TRUNCATE &#39; || cur_tr_table;
 <a name="l02420"></a>02420         cur_tr_table = tr_tables[tr_table_index];
 <a name="l02421"></a>02421
 <a name="l02422"></a>02422         IF (need_analyze) THEN
 <a name="l02423"></a>02423             -- analyze pong table
 <a name="l02424"></a>02424             EXECUTE &#39;ANALYZE &#39; || cur_tr_table;
 <a name="l02425"></a>02425             need_analyze = &#39;f&#39;::BOOL;
 <a name="l02426"></a>02426         END IF;
 <a name="l02427"></a>02427
 <a name="l02428"></a>02428         EXECUTE &#39;TRUNCATE assoc_aux&#39;;
 <a name="l02429"></a>02429
 <a name="l02430"></a>02430         calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
 <a name="l02431"></a>02431
 <a name="l02432"></a>02432         ins_upd_time = ins_upd_time +
 <a name="l02433"></a>02433             (clock_timestamp() - begin_data_transfer);
 <a name="l02434"></a>02434         IF(verbosity &gt; 0) THEN
 <a name="l02435"></a>02435             RAISE INFO &#39;computation time in this level:%&#39;,
 <a name="l02436"></a>02436                 (clock_timestamp() - begin_find_best);
 <a name="l02437"></a>02437         END IF;
 <a name="l02438"></a>02438
 <a name="l02439"></a>02439     END LOOP;
 <a name="l02440"></a>02440
 <a name="l02441"></a>02441     PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
 <a name="l02442"></a>02442
 <a name="l02443"></a>02443     begin_prune = clock_timestamp();
 <a name="l02444"></a>02444     IF (confidence_level &lt; 100.0) THEN
 <a name="l02445"></a>02445        PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
 <a name="l02446"></a>02446     END IF;
 <a name="l02447"></a>02447
 <a name="l02448"></a>02448     IF (validation_table_name IS NOT NULL) THEN
 <a name="l02449"></a>02449        PERFORM MADLIB_SCHEMA.__rep_prune_tree
 <a name="l02450"></a>02450                   (
 <a name="l02451"></a>02451                   result_tree_table_name,
 <a name="l02452"></a>02452                   validation_table_name ,
 <a name="l02453"></a>02453                   num_classes
 <a name="l02454"></a>02454                   );
 <a name="l02455"></a>02455     END IF;
 <a name="l02456"></a>02456     prune_time = clock_timestamp() - begin_prune;
 <a name="l02457"></a>02457
 <a name="l02458"></a>02458     IF(verbosity &gt; 0) THEN
 <a name="l02459"></a>02459         RAISE INFO &#39;time of sampling with replacement: %&#39;, bld_assoc_time;
 <a name="l02460"></a>02460         RAISE INFO &#39;time of finding best and calculating ACS: %&#39;, scv_acs_time;
 <a name="l02461"></a>02461         RAISE INFO &#39;time of calculating ACC: %&#39;, calc_acc_time;
 <a name="l02462"></a>02462         RAISE INFO &#39;time of Insert/update operation: %&#39;, ins_upd_time;
 <a name="l02463"></a>02463         RAISE INFO &#39;time of pruning: %&#39;, prune_time;
 <a name="l02464"></a>02464         RAISE INFO &#39;time of training: %&#39;, clock_timestamp() - begin_func_exec;
 <a name="l02465"></a>02465     END IF;
 <a name="l02466"></a>02466
 <a name="l02467"></a>02467     SELECT MADLIB_SCHEMA.__format
 <a name="l02468"></a>02468         (
 <a name="l02469"></a>02469             &#39;SELECT COUNT(id), max(array_upper(tree_location, 1))
 <a name="l02470"></a>02470              FROM %&#39;,
 <a name="l02471"></a>02471              ARRAY[
 <a name="l02472"></a>02472                 result_tree_table_name
 <a name="l02473"></a>02473              ]
 <a name="l02474"></a>02474         ) INTO curstmt;
 <a name="l02475"></a>02475
 <a name="l02476"></a>02476     EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
 <a name="l02477"></a>02477
 <a name="l02478"></a>02478     ret.features_per_node   = features_per_node;
 <a name="l02479"></a>02479     ret.num_of_samples        = total_size;
 <a name="l02480"></a>02480     ret.calc_acc_time       = calc_acc_time;
 <a name="l02481"></a>02481     ret.calc_pre_time       = calc_pre_time;
 <a name="l02482"></a>02482     ret.update_time         = ins_upd_time;
 <a name="l02483"></a>02483     ret.update_best         = calc_update_best;
 <a name="l02484"></a>02484     ret.update_child        = calc_update_child;
 <a name="l02485"></a>02485     ret.update_nid          = calc_update_nid;
 <a name="l02486"></a>02486     ret.scv_acs_time        = scv_acs_time;
 <a name="l02487"></a>02487     ret.prune_time          = prune_time;
 <a name="l02488"></a>02488
 <a name="l02489"></a>02489     RETURN ret;
 <a name="l02490"></a>02490 END
 <a name="l02491"></a>02491 $$ LANGUAGE PLPGSQL;
 <a name="l02492"></a>02492
 <a name="l02493"></a>02493
 <a name="l02494"></a>02494 <span class="comment">/*</span>
 <a name="l02495"></a>02495 <span class="comment"> * @brief Internal step function of the aggregate __display_tree_aggr. It</span>
 <a name="l02496"></a>02496 <span class="comment"> *        renders one tree node as a line of human-readable text and appends</span>
 <a name="l02497"></a>02497 <span class="comment"> *        that line to the text accumulated so far.</span>
 <a name="l02498"></a>02498 <span class="comment"> *</span>
 <a name="l02499"></a>02499 <span class="comment"> * @param state     The display text accumulated so far. It may be NULL,</span>
 <a name="l02500"></a>02500 <span class="comment"> *                  in which case only the text of this node is returned.</span>
 <a name="l02501"></a>02501 <span class="comment"> * @param depth     The depth of this node; values &lt;= 0 are shown as the root.</span>
 <a name="l02502"></a>02502 <span class="comment"> * @param is_cont   Whether the feature used to split is continuous.</span>
 <a name="l02503"></a>02503 <span class="comment"> * @param feat_name The name of the feature used to split.</span>
 <a name="l02504"></a>02504 <span class="comment"> * @param curr_val  The value of the splitting feature for this node.</span>
 <a name="l02505"></a>02505 <span class="comment"> * @param split_value    For a continuous feature, the split value.</span>
 <a name="l02506"></a>02506 <span class="comment"> *                  Otherwise, it is ignored.</span>
 <a name="l02507"></a>02507 <span class="comment"> * @param max_prob  For those elements in this node, the probability that</span>
 <a name="l02508"></a>02508 <span class="comment"> *                  an element belongs to the max_class.</span>
 <a name="l02509"></a>02509 <span class="comment"> * @param max_class The class ID with the largest number of elements</span>
 <a name="l02510"></a>02510 <span class="comment"> *                  for those elements in this node.</span>
 <a name="l02511"></a>02511 <span class="comment"> * @param num_of_samples Total count of samples in this node.</span>
 <a name="l02512"></a>02512 <span class="comment"> *</span>
 <a name="l02513"></a>02513 <span class="comment"> * @return The accumulated display text followed by the line for this node.</span>
 <a name="l02514"></a>02514 <span class="comment"> *         NULL arguments are printed as &#39;null&#39; or 0, so that a single NULL</span>
 <a name="l02515"></a>02515 <span class="comment"> *         field can never null out the text accumulated so far.</span>
 <a name="l02516"></a>02516 <span class="comment"> */</span>
 <a name="l02517"></a>02517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
 <a name="l02518"></a>02518     (
 <a name="l02519"></a>02519     state           TEXT,
 <a name="l02520"></a>02520     depth           INT,
 <a name="l02521"></a>02521     is_cont         BOOLEAN,
 <a name="l02522"></a>02522     feat_name       TEXT,
 <a name="l02523"></a>02523     curr_val        TEXT,
 <a name="l02524"></a>02524     split_value     FLOAT8,
 <a name="l02525"></a>02525     max_prob        FLOAT8,
 <a name="l02526"></a>02526     max_class       TEXT,
 <a name="l02527"></a>02527     num_of_samples  INT
 <a name="l02528"></a>02528     )
 <a name="l02529"></a>02529 RETURNS TEXT AS $$
 <a name="l02530"></a>02530 DECLARE
 <a name="l02531"></a>02531     ret                     TEXT := &#39;&#39;;
 <a name="l02532"></a>02532     index                   INT;
 <a name="l02533"></a>02533 BEGIN
 <a name="l02534"></a>02534     -- Indent by (depth + 1) units of four spaces.
 <a name="l02535"></a>02535     FOR index IN 0..depth LOOP
 <a name="l02536"></a>02536         ret = ret || &#39;    &#39;;
 <a name="l02537"></a>02537     END LOOP;
 <a name="l02538"></a>02538
 <a name="l02539"></a>02539     IF (depth &gt; 0) THEN
 <a name="l02540"></a>02540         ret = ret ||coalesce(feat_name,&#39;null&#39;)||&#39;: &#39;;
 <a name="l02541"></a>02541         -- A continuous feature has exactly two branches:
 <a name="l02542"></a>02542         -- curr_val = 1 denotes the &#39;&lt;=&#39; branch and any other
 <a name="l02543"></a>02543         -- value denotes the &#39;&gt;&#39; branch.
 <a name="l02544"></a>02544         IF (is_cont) THEN
 <a name="l02545"></a>02545             IF (curr_val::INT = 1) THEN
 <a name="l02546"></a>02546                 ret = ret || &#39; &lt;= &#39;;
 <a name="l02547"></a>02547             ELSE
 <a name="l02548"></a>02548                 ret = ret || &#39; &gt; &#39;;
 <a name="l02549"></a>02549             END IF;
 <a name="l02550"></a>02550             ret = ret||coalesce(split_value,0)||&#39; &#39;;
 <a name="l02551"></a>02551         ELSE
 <a name="l02552"></a>02552             ret = ret||&#39; = &#39;||coalesce(curr_val,&#39;null&#39;)||&#39; &#39;;
 <a name="l02553"></a>02553         END IF;
 <a name="l02554"></a>02554     ELSE
 <a name="l02555"></a>02555         ret = ret||&#39;Root Node &#39;;
 <a name="l02556"></a>02556     END IF;
 <a name="l02557"></a>02557     -- Coalesce every operand: one NULL would turn the whole string into NULL.
 <a name="l02558"></a>02558     ret = ret                               ||
 <a name="l02559"></a>02559           &#39; : class(&#39;                       ||
 <a name="l02560"></a>02560           coalesce(max_class,&#39;null&#39;)        ||
 <a name="l02561"></a>02561           &#39;)   num_elements(&#39;               ||
 <a name="l02562"></a>02562           coalesce(num_of_samples,0)        ||
 <a name="l02563"></a>02563           &#39;)  predict_prob(&#39;                ||
 <a name="l02564"></a>02564           coalesce(max_prob,0)              ||
 <a name="l02565"></a>02565           &#39;)&#39;;
 <a name="l02566"></a>02566
 <a name="l02567"></a>02567     ret = ret || E&#39;\n&#39;;
 <a name="l02568"></a>02568
 <a name="l02569"></a>02569     -- If text has already been accumulated, put the line
 <a name="l02570"></a>02570     -- for this node after it.
 <a name="l02571"></a>02571     IF (state IS NOT NULL) THEN
 <a name="l02572"></a>02572         ret = state || ret;
 <a name="l02573"></a>02573     END IF;
 <a name="l02574"></a>02574
 <a name="l02575"></a>02575     RETURN ret;
 <a name="l02576"></a>02576 END
 <a name="l02577"></a>02577 $$ LANGUAGE PLPGSQL;
 <a name="l02578"></a>02578
 <a name="l02579"></a>02579 -- Recreate the tree-display aggregate over __display_node_sfunc (state type TEXT); it is declared ORDERED on Greenplum builds that have ordered aggregates.
 <a name="l02580"></a>02580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
 <a name="l02581"></a>02581     (
 <a name="l02582"></a>02582     INT,        -- depth of the node
 <a name="l02583"></a>02583     BOOLEAN,    -- is_cont: whether the split feature is continuous
 <a name="l02584"></a>02584     TEXT,       -- name of the feature used to split
 <a name="l02585"></a>02585     TEXT,       -- curr_val: value of the splitting feature for the node
 <a name="l02586"></a>02586     FLOAT8,     -- split value (continuous features)
 <a name="l02587"></a>02587     FLOAT8,     -- max_prob: probability of max_class in the node
 <a name="l02588"></a>02588     TEXT,       -- max_class
 <a name="l02589"></a>02589     INT         -- num_of_samples in the node
 <a name="l02590"></a>02590     ) CASCADE;
 <a name="l02591"></a>02591 CREATE
 <a name="l02592"></a>02592 m4_ifdef(`__GREENPLUM__&#39;, m4_ifdef(`__HAS_ORDERED_AGGREGATES__&#39;, `ORDERED&#39;))
 <a name="l02593"></a>02593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
 <a name="l02594"></a>02594     (
 <a name="l02595"></a>02595     INT,        -- depth of the node
 <a name="l02596"></a>02596     BOOLEAN,    -- is_cont: whether the split feature is continuous
 <a name="l02597"></a>02597     TEXT,       -- name of the feature used to split
 <a name="l02598"></a>02598     TEXT,       -- curr_val: value of the splitting feature for the node
 <a name="l02599"></a>02599     FLOAT8,     -- split value (continuous features)
 <a name="l02600"></a>02600     FLOAT8,     -- max_prob: probability of max_class in the node
 <a name="l02601"></a>02601     TEXT,       -- max_class
 <a name="l02602"></a>02602     INT         -- num_of_samples in the node
 <a name="l02603"></a>02603     )
 <a name="l02604"></a>02604 (
 <a name="l02605"></a>02605   SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
 <a name="l02606"></a>02606   STYPE=TEXT
 <a name="l02607"></a>02607 );
 <a name="l02608"></a>02608
 <a name="l02609"></a>02609
 <a name="l02610"></a>02610 <span class="comment">/*</span>
 <a name="l02611"></a>02611 <span class="comment"> * @brief Display the trained model in a human-readable format. This function</span>
 <a name="l02612"></a>02612 <span class="comment"> *        uses an ordered aggregate to display the tree with only one scan of</span>
 <a name="l02613"></a>02613 <span class="comment"> *        the tree_table.</span>
 <a name="l02614"></a>02614 <span class="comment"> *</span>
 <a name="l02615"></a>02615 <span class="comment"> * @param tree_table  The full name of the tree table. </span>
 <a name="l02616"></a>02616 <span class="comment"> * @param tree_id     The array contains the IDs of the trees to be displayed.</span>
 <a name="l02617"></a>02617 <span class="comment"> * @param max_depth   The max depth to be displayed. If it is set to null,</span>
 <a name="l02618"></a>02618 <span class="comment"> *                    this function will show all levels. </span>
 <a name="l02619"></a>02619 <span class="comment"> *</span>
 <a name="l02620"></a>02620 <span class="comment"> * @return The text representing the tree in a human-readable format.</span>
 <a name="l02621"></a>02621 <span class="comment"> *</span>
 <a name="l02622"></a>02622 <span class="comment"> */</span>
 <a name="l02623"></a>02623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
 <a name="l02624"></a>02624     (
 <a name="l02625"></a>02625     tree_table  TEXT,
 <a name="l02626"></a>02626     tree_id     INT[],
 <a name="l02627"></a>02627     max_depth   INT
 <a name="l02628"></a>02628     )
 <a name="l02629"></a>02629 RETURNS SETOF TEXT AS $$
 <a name="l02630"></a>02630 DECLARE
 <a name="l02631"></a>02631     metatable_name  TEXT := null;
 <a name="l02632"></a>02632     curr_stmt       TEXT := null;
 <a name="l02633"></a>02633     feature_name    TEXT := null;
 <a name="l02634"></a>02634     table_name      TEXT := null;
 <a name="l02635"></a>02635     result          TEXT := &#39;&#39;;
 <a name="l02636"></a>02636     result_rec      RECORD;
 <a name="l02637"></a>02637 BEGIN
 <a name="l02638"></a>02638     PERFORM MADLIB_SCHEMA.__assert_table
 <a name="l02639"></a>02639             (
 <a name="l02640"></a>02640                 tree_table,
 <a name="l02641"></a>02641                 &#39;t&#39;
 <a name="l02642"></a>02642             );
 <a name="l02643"></a>02643
 <a name="l02644"></a>02644     metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
 <a name="l02645"></a>02645
 <a name="l02646"></a>02646     -- This table is used for tree display.
 <a name="l02647"></a>02647     -- It is filled with the original information before
 <a name="l02648"></a>02648     -- encoding to facilitate the display procedure.
 <a name="l02649"></a>02649     DROP TABLE IF EXISTS auxiliary_tree_display;
 <a name="l02650"></a>02650     CREATE TEMP TABLE auxiliary_tree_display
 <a name="l02651"></a>02651     (
 <a name="l02652"></a>02652         tid                     INT,
 <a name="l02653"></a>02653         id                      INT,
 <a name="l02654"></a>02654         tree_location           INT[],
 <a name="l02655"></a>02655         probability             FLOAT8,
 <a name="l02656"></a>02656         max_class               TEXT,
 <a name="l02657"></a>02657         num_of_samples          INT,
 <a name="l02658"></a>02658         parent_id               INT,
 <a name="l02659"></a>02659         curr_value              TEXT,
 <a name="l02660"></a>02660         parent_feature_id       INT,
 <a name="l02661"></a>02661         is_parent_feature_cont  BOOLEAN,
 <a name="l02662"></a>02662         parent_split_value      FLOAT8,
 <a name="l02663"></a>02663         parent_feature_name     TEXT
 <a name="l02664"></a>02664     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l02665"></a>02665
 <a name="l02666"></a>02666     -- We made a self join for the tree table. For each node, we get the
 <a name="l02667"></a>02667     -- feature information at its parent node so as to display this node.
 <a name="l02668"></a>02668     SELECT MADLIB_SCHEMA.__format(
 <a name="l02669"></a>02669         &#39;INSERT INTO auxiliary_tree_display SELECT m.*,
 <a name="l02670"></a>02670         n.column_name as parent_feature_name
 <a name="l02671"></a>02671         FROM
 <a name="l02672"></a>02672         (SELECT * FROM
 <a name="l02673"></a>02673             (SELECT t1.tid,t1.id, t1.tree_location,
 <a name="l02674"></a>02674             t1.probability,t1.max_class::TEXT,
 <a name="l02675"></a>02675             t1.num_of_samples,t1.parent_id,
 <a name="l02676"></a>02676             t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
 <a name="l02677"></a>02677                 as curr_value,
 <a name="l02678"></a>02678             t2.feature as parent_feature_id,
 <a name="l02679"></a>02679             t2.is_cont as is_parent_feature_cont,
 <a name="l02680"></a>02680             t2.split_value as parent_split_value
 <a name="l02681"></a>02681             FROM % t1 LEFT JOIN % t2 ON
 <a name="l02682"></a>02682             (t1.parent_id = t2.id AND
 <a name="l02683"></a>02683             (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
 <a name="l02684"></a>02684             WHERE l.tid in ( % ) ) m
 <a name="l02685"></a>02685          LEFT JOIN % n
 <a name="l02686"></a>02686             on m.parent_feature_id = n.id;&#39;,
 <a name="l02687"></a>02687         ARRAY[
 <a name="l02688"></a>02688             tree_table,
 <a name="l02689"></a>02689             tree_table,
 <a name="l02690"></a>02690             array_to_string(tree_id,&#39;,&#39;),
 <a name="l02691"></a>02691             metatable_name
 <a name="l02692"></a>02692         ]
 <a name="l02693"></a>02693         )
 <a name="l02694"></a>02694     INTO curr_stmt;
 <a name="l02695"></a>02695     EXECUTE curr_stmt;
 <a name="l02696"></a>02696
 <a name="l02697"></a>02697     -- Get the metatable storing the encoding information of class.
 <a name="l02698"></a>02698     SELECT MADLIB_SCHEMA.__format
 <a name="l02699"></a>02699         (
 <a name="l02700"></a>02700             &#39;SELECT
 <a name="l02701"></a>02701                 column_name,
 <a name="l02702"></a>02702                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
 <a name="l02703"></a>02703              FROM  %
 <a name="l02704"></a>02704              WHERE column_type=&#39;&#39;c&#39;&#39; LIMIT 1&#39;,
 <a name="l02705"></a>02705             ARRAY[
 <a name="l02706"></a>02706                 metatable_name
 <a name="l02707"></a>02707             ]
 <a name="l02708"></a>02708         ) INTO curr_stmt;
 <a name="l02709"></a>02709
 <a name="l02710"></a>02710     EXECUTE curr_stmt INTO result_rec;
 <a name="l02711"></a>02711
 <a name="l02712"></a>02712     table_name = result_rec.table_name;
 <a name="l02713"></a>02713
 <a name="l02714"></a>02714     IF (table_name IS NOT NULL) THEN
 <a name="l02715"></a>02715         -- Convert back for the class column.
 <a name="l02716"></a>02716         SELECT MADLIB_SCHEMA.__format(
 <a name="l02717"></a>02717             &#39;UPDATE auxiliary_tree_display n
 <a name="l02718"></a>02718              SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
 <a name="l02719"></a>02719              FROM % m
 <a name="l02720"></a>02720              WHERE m.code = n.max_class::INT
 <a name="l02721"></a>02721             &#39;,
 <a name="l02722"></a>02722             ARRAY[
 <a name="l02723"></a>02723                 table_name
 <a name="l02724"></a>02724             ]
 <a name="l02725"></a>02725             )
 <a name="l02726"></a>02726         INTO curr_stmt;
 <a name="l02727"></a>02727         EXECUTE curr_stmt;
 <a name="l02728"></a>02728     END IF;
 <a name="l02729"></a>02729
 <a name="l02730"></a>02730     -- Get the metatables storing the encoding information for discrete features.
 <a name="l02731"></a>02731     SELECT MADLIB_SCHEMA.__format
 <a name="l02732"></a>02732         (
 <a name="l02733"></a>02733             &#39;SELECT
 <a name="l02734"></a>02734         id,
 <a name="l02735"></a>02735                 column_name,
 <a name="l02736"></a>02736                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
 <a name="l02737"></a>02737              FROM %
 <a name="l02738"></a>02738              WHERE NOT is_cont AND column_type=&#39;&#39;f&#39;&#39;;&#39;,
 <a name="l02739"></a>02739             ARRAY[
 <a name="l02740"></a>02740                 metatable_name
 <a name="l02741"></a>02741             ]
 <a name="l02742"></a>02742         )
 <a name="l02743"></a>02743     INTO curr_stmt;
 <a name="l02744"></a>02744
 <a name="l02745"></a>02745     -- Convert back for discrete features.
 <a name="l02746"></a>02746     FOR result_rec IN EXECUTE (curr_stmt) LOOP
 <a name="l02747"></a>02747         SELECT MADLIB_SCHEMA.__format(
 <a name="l02748"></a>02748             &#39;UPDATE auxiliary_tree_display n
 <a name="l02749"></a>02749              SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
 <a name="l02750"></a>02750              FROM % m
 <a name="l02751"></a>02751              WHERE m.code::INT = n.curr_value::INT AND
 <a name="l02752"></a>02752            m.fid = %              AND
 <a name="l02753"></a>02753                    n.parent_feature_name = %
 <a name="l02754"></a>02754             &#39;,
 <a name="l02755"></a>02755             ARRAY[
 <a name="l02756"></a>02756                 result_rec.table_name,
 <a name="l02757"></a>02757         result_rec.id::TEXT,
 <a name="l02758"></a>02758                 quote_literal(result_rec.column_name)
 <a name="l02759"></a>02759             ]
 <a name="l02760"></a>02760             )
 <a name="l02761"></a>02761         INTO curr_stmt;
 <a name="l02762"></a>02762         EXECUTE curr_stmt;
 <a name="l02763"></a>02763     END LOOP;
 <a name="l02764"></a>02764
 <a name="l02765"></a>02765     -- Now we already get all the information. Invoke the
 <a name="l02766"></a>02766     -- aggregation to show the tree.
 <a name="l02767"></a>02767     -- If we order by tree_location, we can get the sequence
 <a name="l02768"></a>02768     -- of depth first traversal.
 <a name="l02769"></a>02769     curr_stmt = &#39;SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
 <a name="l02770"></a>02770                 array_upper(tree_location,1)-1,
 <a name="l02771"></a>02771                 is_parent_feature_cont,
 <a name="l02772"></a>02772                 parent_feature_name,
 <a name="l02773"></a>02773                 curr_value,
 <a name="l02774"></a>02774                 parent_split_value,
 <a name="l02775"></a>02775                 probability,
 <a name="l02776"></a>02776                 max_class,
 <a name="l02777"></a>02777                 num_of_samples
 <a name="l02778"></a>02778                 order by tree_location) AS disp_str
 <a name="l02779"></a>02779          FROM auxiliary_tree_display&#39;;
 <a name="l02780"></a>02780
 <a name="l02781"></a>02781     IF (max_depth IS NOT NULL) THEN
 <a name="l02782"></a>02782         curr_stmt = curr_stmt                                   ||
 <a name="l02783"></a>02783                     &#39; WHERE array_upper(tree_location,1) - 1 &lt;=&#39;  ||
 <a name="l02784"></a>02784                     max_depth;
 <a name="l02785"></a>02785     END IF;
 <a name="l02786"></a>02786
 <a name="l02787"></a>02787     curr_stmt = curr_stmt||&#39; GROUP BY tid ORDER BY tid;&#39;;
 <a name="l02788"></a>02788
 <a name="l02789"></a>02789     FOR result_rec IN EXECUTE curr_stmt LOOP
 <a name="l02790"></a>02790         SELECT MADLIB_SCHEMA.__format(
 <a name="l02791"></a>02791             E&#39;\nTree %\n%&#39;,
 <a name="l02792"></a>02792             ARRAY[
 <a name="l02793"></a>02793                 result_rec.tid::TEXT,
 <a name="l02794"></a>02794                 result_rec.disp_str
 <a name="l02795"></a>02795             ]
 <a name="l02796"></a>02796             )
 <a name="l02797"></a>02797         INTO result;
 <a name="l02798"></a>02798         RETURN NEXT result;
 <a name="l02799"></a>02799         --RETURN NEXT E&#39;\nTree &#39;||result_rec.tid||E&#39;\n&#39;||result_rec.disp_str;
 <a name="l02800"></a>02800     END LOOP;
 <a name="l02801"></a>02801     RETURN;
 <a name="l02802"></a>02802 END $$ LANGUAGE PLPGSQL;
 <a name="l02803"></a>02803
 <a name="l02804"></a>02804
 <a name="l02805"></a>02805 <span class="comment">/*</span>
 <a name="l02806"></a>02806 <span class="comment"> * @brief This is an internal function for displaying the tree in human readable</span>
 <a name="l02807"></a>02807 <span class="comment"> *        format. It uses the depth-first strategy to traverse a tree and print </span>
 <a name="l02808"></a>02808 <span class="comment"> *        values. This function is used on databases, e.g. PG 8.4, that do not </span>
 <a name="l02809"></a>02809 <span class="comment"> *        support ordered aggregate.</span>
 <a name="l02810"></a>02810 <span class="comment"> *</span>
 <a name="l02811"></a>02811 <span class="comment"> * @param tree_table      The full name of the tree table. </span>
 <a name="l02812"></a>02812 <span class="comment"> * @param id              The ID of current node. This node and all of its  </span>
 <a name="l02813"></a>02813 <span class="comment"> *                        children are displayed.</span>
 <a name="l02814"></a>02814 <span class="comment"> * @param feature_id      The ID of a feature, which is used to split in the </span>
 <a name="l02815"></a>02815 <span class="comment"> *                        parent of current node.</span>
 <a name="l02816"></a>02816 <span class="comment"> * @param depth           The depth of current node.</span>
 <a name="l02817"></a>02817 <span class="comment"> * @param is_cont         It specifies whether the feature denoted by &#39;feature_id&#39;</span>
 <a name="l02818"></a>02818 <span class="comment"> *                        is continuous or not.</span>
 <a name="l02819"></a>02819 <span class="comment"> * @param split_value     For continuous feature, it specifies the split value. </span>
 <a name="l02820"></a>02820 <span class="comment"> *                        Otherwise, it is of no meaning.</span>
 <a name="l02821"></a>02821 <span class="comment"> * @param metatable_name  For tabular format, this table contains the meta data</span>
 <a name="l02822"></a>02822 <span class="comment"> *                        to encode the input table.</span>
 <a name="l02823"></a>02823 <span class="comment"> * @param max_depth       The max depth to be displayed. If it is set to null,</span>
 <a name="l02824"></a>02824 <span class="comment"> *                        this function will show all levels. </span>
 <a name="l02825"></a>02825 <span class="comment"> * @param tree_id         The ID of the tree to be displayed.</span>
 <a name="l02826"></a>02826 <span class="comment"> *</span>
 <a name="l02827"></a>02827 <span class="comment"> * @return The text representing the tree with human readable format.</span>
 <a name="l02828"></a>02828 <span class="comment"> *</span>
 <a name="l02829"></a>02829 <span class="comment"> */</span>
 <a name="l02830"></a>02830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
 <a name="l02831"></a>02831     (
 <a name="l02832"></a>02832     tree_table      TEXT,
 <a name="l02833"></a>02833     id              INT,
 <a name="l02834"></a>02834     feature_id      INT,
 <a name="l02835"></a>02835     depth           INT,
 <a name="l02836"></a>02836     is_cont         BOOLEAN,
 <a name="l02837"></a>02837     split_value     FLOAT,
 <a name="l02838"></a>02838     metatable_name  TEXT,
 <a name="l02839"></a>02839     max_depth       INT,
 <a name="l02840"></a>02840     tree_id         INT
 <a name="l02841"></a>02841     )
 <a name="l02842"></a>02842 RETURNS TEXT AS $$
 <a name="l02843"></a>02843 DECLARE
 <a name="l02844"></a>02844     ret                     TEXT := &#39;&#39;;
 <a name="l02845"></a>02845     tree_location           INT[];
 <a name="l02846"></a>02846     feature                 INT;
 <a name="l02847"></a>02847     max_class               INT;
 <a name="l02848"></a>02848     num_of_samples          INT;
 <a name="l02849"></a>02849     curr_is_cont            BOOLEAN;  -- is_cont of this node; must not shadow the is_cont parameter
 <a name="l02850"></a>02850     temp_split_value        FLOAT;
 <a name="l02851"></a>02851     index                   INT;
 <a name="l02852"></a>02852     curr_value              INT;
 <a name="l02853"></a>02853     probability             FLOAT;
 <a name="l02854"></a>02854     curstmt                 TEXT;
 <a name="l02855"></a>02855     child_nid               INT;
 <a name="l02856"></a>02856 BEGIN
 <a name="l02857"></a>02857     IF (id IS NULL OR id &lt;= 0) THEN  -- no node to display: return the empty string
 <a name="l02858"></a>02858         RETURN ret;
 <a name="l02859"></a>02859     END IF;
 <a name="l02860"></a>02860
 <a name="l02861"></a>02861     SELECT MADLIB_SCHEMA.__format
 <a name="l02862"></a>02862             (
 <a name="l02863"></a>02863                 &#39;SELECT tree_location, feature, is_cont,
 <a name="l02864"></a>02864                         split_value, max_class,num_of_samples,probability
 <a name="l02865"></a>02865                  FROM %
 <a name="l02866"></a>02866                  WHERE id = % AND tid=%&#39;,
 <a name="l02867"></a>02867                  ARRAY[
 <a name="l02868"></a>02868                     tree_table,
 <a name="l02869"></a>02869                     MADLIB_SCHEMA.__to_char(id),
 <a name="l02870"></a>02870                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l02871"></a>02871                  ]
 <a name="l02872"></a>02872              )
 <a name="l02873"></a>02873     INTO curstmt;
 <a name="l02874"></a>02874
 <a name="l02875"></a>02875     EXECUTE curstmt INTO tree_location, feature, curr_is_cont,
 <a name="l02876"></a>02876                          temp_split_value, max_class, num_of_samples, probability;
 <a name="l02877"></a>02877
 <a name="l02878"></a>02878     curr_value = tree_location[array_upper(tree_location,1)];  -- last element of the node location
 <a name="l02879"></a>02879
 <a name="l02880"></a>02880     FOR index IN 0..depth LOOP  -- indent by (depth + 1) units of four spaces
 <a name="l02881"></a>02881         ret = ret || &#39;    &#39;;
 <a name="l02882"></a>02882     END LOOP;
 <a name="l02883"></a>02883
 <a name="l02884"></a>02884     IF (id &gt;tree_id) THEN  -- NOTE(review): other ids print as the root node; confirm root ids never exceed tree_id
 <a name="l02885"></a>02885         ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||&#39;: &#39;;
 <a name="l02886"></a>02886
 <a name="l02887"></a>02887         IF (is_cont) THEN  -- parameter: describes feature_id, the feature split on in the parent
 <a name="l02888"></a>02888             IF (curr_value = 1) THEN
 <a name="l02889"></a>02889                 ret = ret || &#39; &lt;= &#39;;
 <a name="l02890"></a>02890             ELSE
 <a name="l02891"></a>02891                 ret = ret || &#39; &gt; &#39;;
 <a name="l02892"></a>02892             END IF;
 <a name="l02893"></a>02893             ret = ret || split_value;
 <a name="l02894"></a>02894         ELSE
 <a name="l02895"></a>02895             ret = ret   ||
 <a name="l02896"></a>02896                   &#39; = &#39; ||
 <a name="l02897"></a>02897                   MADLIB_SCHEMA.__get_feature_value
 <a name="l02898"></a>02898                     (
 <a name="l02899"></a>02899                     feature_id,
 <a name="l02900"></a>02900                     curr_value,
 <a name="l02901"></a>02901                     metatable_name
 <a name="l02902"></a>02902                     );
 <a name="l02903"></a>02903         END IF;
 <a name="l02904"></a>02904     ELSE
 <a name="l02905"></a>02905         ret = ret||&#39;Root Node &#39;;
 <a name="l02906"></a>02906     END IF;
 <a name="l02907"></a>02907
 <a name="l02908"></a>02908     ret = ret                                                       ||
 <a name="l02909"></a>02909           &#39; : class(&#39;                                               ||
 <a name="l02910"></a>02910           MADLIB_SCHEMA.__get_class_value(max_class,metatable_name)  ||
 <a name="l02911"></a>02911           &#39;)   num_elements(&#39;                                       ||
 <a name="l02912"></a>02912           num_of_samples                                                 ||
 <a name="l02913"></a>02913           &#39;)  predict_prob(&#39;                                        ||
 <a name="l02914"></a>02914           probability                                               ||
 <a name="l02915"></a>02915           &#39;)&#39;;
 <a name="l02916"></a>02916
 <a name="l02917"></a>02917     ret = ret || E&#39;\n&#39;;
 <a name="l02918"></a>02918
 <a name="l02919"></a>02919     IF (max_depth IS NOT NULL AND  -- stop descending once the requested depth is reached
 <a name="l02920"></a>02920         depth &gt;= max_depth) THEN
 <a name="l02921"></a>02921         RETURN ret;
 <a name="l02922"></a>02922     END IF;
 <a name="l02923"></a>02923
 <a name="l02924"></a>02924     curstmt = MADLIB_SCHEMA.__format
 <a name="l02925"></a>02925                 (
 <a name="l02926"></a>02926                     &#39;SELECT id
 <a name="l02927"></a>02927                      FROM %
 <a name="l02928"></a>02928                      WHERE parent_id = % AND tid=%
 <a name="l02929"></a>02929                      ORDER BY id&#39;,
 <a name="l02930"></a>02930                     ARRAY[
 <a name="l02931"></a>02931                         tree_table,
 <a name="l02932"></a>02932                         MADLIB_SCHEMA.__to_char(id),
 <a name="l02933"></a>02933                         MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l02934"></a>02934                         ]
 <a name="l02935"></a>02935                 );
 <a name="l02936"></a>02936
 <a name="l02937"></a>02937     FOR child_nid IN EXECUTE curstmt LOOP  -- children in id order, each rendered recursively
 <a name="l02938"></a>02938         ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
 <a name="l02939"></a>02939                         tree_table,
 <a name="l02940"></a>02940                         child_nid,
 <a name="l02941"></a>02941                         feature,           -- feature read from this node row
 <a name="l02942"></a>02942                         depth + 1,
 <a name="l02943"></a>02943                         curr_is_cont,      -- is_cont read from this node row
 <a name="l02944"></a>02944                         temp_split_value,  -- split_value read from this node row
 <a name="l02945"></a>02945                         metatable_name,
 <a name="l02946"></a>02946                         max_depth,
 <a name="l02947"></a>02947                         tree_id);
 <a name="l02948"></a>02948     END LOOP;
 <a name="l02949"></a>02949
 <a name="l02950"></a>02950     RETURN ret;
 <a name="l02951"></a>02951 END $$ LANGUAGE PLPGSQL;
 <a name="l02952"></a>02952
 <a name="l02953"></a>02953
 <a name="l02954"></a>02954 <span class="comment">/*</span>
 <a name="l02955"></a>02955 <span class="comment"> * @brief Display the trained model in human readable format. It uses the </span>
 <a name="l02956"></a>02956 <span class="comment"> *        recursive algorithm, which is slower than the version with </span>
 <a name="l02957"></a>02957 <span class="comment"> *        ordered aggregate. We only use it when ordered aggregate is unavailable.</span>
 <a name="l02958"></a>02958 <span class="comment"> *</span>
 <a name="l02959"></a>02959 <span class="comment"> * @param tree_table  The full name of the tree table. </span>
 <a name="l02960"></a>02960 <span class="comment"> * @param tree_id     The array contains the IDs of the trees to be displayed.</span>
 <a name="l02961"></a>02961 <span class="comment"> * @param max_depth   The max depth to be displayed. If it is set to null,</span>
 <a name="l02962"></a>02962 <span class="comment"> *                    this function will show all levels. </span>
 <a name="l02963"></a>02963 <span class="comment"> *</span>
 <a name="l02964"></a>02964 <span class="comment"> * @return The text representing the tree with human readable format.</span>
 <a name="l02965"></a>02965 <span class="comment"> *</span>
 <a name="l02966"></a>02966 <span class="comment"> */</span>
 <a name="l02967"></a>02967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
 <a name="l02968"></a>02968     (
 <a name="l02969"></a>02969     tree_table  TEXT,
 <a name="l02970"></a>02970     tree_id     INT[],
 <a name="l02971"></a>02971     max_depth   INT
 <a name="l02972"></a>02972     )
 <a name="l02973"></a>02973 RETURNS SETOF TEXT AS $$
 <a name="l02974"></a>02974 DECLARE
 <a name="l02975"></a>02975     meta_tbl    TEXT;   -- result of __get_metatable_name for tree_table
 <a name="l02976"></a>02976     root_query  TEXT;   -- dynamic statement that looks up the root node of one tree
 <a name="l02977"></a>02977     pos         INT;    -- current subscript into tree_id
 <a name="l02978"></a>02978     rendering   TEXT;   -- text returned for the tree being displayed
 <a name="l02979"></a>02979     root_nid    INT;    -- id of the node whose parent_id is 0
 <a name="l02980"></a>02980 BEGIN
 <a name="l02981"></a>02981     -- Check tree_table through __assert_table before it is used in dynamic SQL.
 <a name="l02982"></a>02982     PERFORM MADLIB_SCHEMA.__assert_table(tree_table, &#39;t&#39;);
 <a name="l02983"></a>02983
 <a name="l02984"></a>02984     SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table) INTO meta_tbl;
 <a name="l02985"></a>02985
 <a name="l02986"></a>02986     -- Visit the requested tree ids from the lowest array subscript to the highest.
 <a name="l02987"></a>02987     pos := array_lower(tree_id, 1);
 <a name="l02988"></a>02988     WHILE (pos &lt;= array_upper(tree_id, 1)) LOOP
 <a name="l02989"></a>02989         -- The root of a tree is looked up as the node with parent_id = 0.
 <a name="l02990"></a>02990         root_query := &#39;SELECT id FROM &#39; || tree_table ||
 <a name="l02991"></a>02991                       &#39; WHERE parent_id=0 and tid=&#39; || tree_id[pos] || &#39;;&#39;;
 <a name="l02992"></a>02992         EXECUTE root_query INTO root_nid;
 <a name="l02993"></a>02993
 <a name="l02994"></a>02994         rendering := MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
 <a name="l02995"></a>02995                          tree_table, root_nid, 0, 0, &#39;f&#39;, 0,
 <a name="l02996"></a>02996                          meta_tbl, max_depth, tree_id[pos]);
 <a name="l02997"></a>02997         RETURN NEXT E&#39;\nTree &#39; || tree_id[pos] || E&#39;\n&#39; || rendering;
 <a name="l02998"></a>02998         pos := pos + 1;
 <a name="l02999"></a>02999     END LOOP;
 <a name="l03000"></a>03000     RETURN;
 <a name="l03001"></a>03001 END $$ LANGUAGE PLPGSQL;
 <a name="l03002"></a>03002
 <a name="l03003"></a>03003
 <a name="l03004"></a>03004 <span class="comment">/*</span>
 <a name="l03005"></a>03005 <span class="comment"> * @brief Multiple trees may classify the same record to different classes. </span>
 <a name="l03006"></a>03006 <span class="comment"> *        This function gets the results voted by multiple trees.</span>
 <a name="l03007"></a>03007 <span class="comment"> *</span>
 <a name="l03008"></a>03008 <span class="comment"> * @param src_table    The full name of the table containing original data.</span>
 <a name="l03009"></a>03009 <span class="comment"> * @param dst_table    The full name of the table to store the voted results. </span>
 <a name="l03010"></a>03010 <span class="comment"> *</span>
 <a name="l03011"></a>03011 <span class="comment"> */</span>
 <a name="l03012"></a>03012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
 <a name="l03013"></a>03013     (
 <a name="l03014"></a>03014     src_table   TEXT,
 <a name="l03015"></a>03015     dst_table   TEXT
 <a name="l03016"></a>03016     )
 <a name="l03017"></a>03017 RETURNS VOID AS $$
 <a name="l03018"></a>03018 DECLARE
 <a name="l03019"></a>03019     vote_stmt TEXT;  -- dynamic INSERT that fills dst_table from src_table
 <a name="l03020"></a>03020 BEGIN
 <a name="l03021"></a>03021     -- Recreate dst_table as an empty temp table with columns (id, class, prob).
 <a name="l03022"></a>03022     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || dst_table;
 <a name="l03023"></a>03023     EXECUTE &#39;CREATE TEMP TABLE &#39; || dst_table || E&#39;
 <a name="l03024"></a>03024     (
 <a name="l03025"></a>03025         id          BIGINT,
 <a name="l03026"></a>03026         class       INT,
 <a name="l03027"></a>03027         prob        FLOAT8
 <a name="l03028"></a>03028     )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
 <a name="l03029"></a>03029
 <a name="l03030"></a>03030     -- Per id: count rows and average prob for each class, then keep the largest [count, prob, class] array.
 <a name="l03031"></a>03031     vote_stmt := MADLIB_SCHEMA.__format(
 <a name="l03032"></a>03032         &#39;INSERT INTO %
 <a name="l03033"></a>03033         SELECT id, max_array[3], max_array[2] FROM
 <a name="l03034"></a>03034             (SELECT id, max(array[count,prob,class]) AS max_array FROM
 <a name="l03035"></a>03035                 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
 <a name="l03036"></a>03036                     % GROUP BY id,class) t1 GROUP BY id) t2&#39;,
 <a name="l03037"></a>03037         ARRAY[dst_table, src_table]
 <a name="l03038"></a>03038         );
 <a name="l03039"></a>03039
 <a name="l03040"></a>03040     -- Run the assembled INSERT.
 <a name="l03041"></a>03041     EXECUTE vote_stmt;
 <a name="l03042"></a>03042     RETURN;
 <a name="l03043"></a>03043 END
 <a name="l03044"></a>03044 $$ LANGUAGE PLPGSQL;
 <a name="l03045"></a>03045
 <a name="l03046"></a>03046
 <a name="l03047"></a>03047 <span class="comment">/*</span>
 <a name="l03048"></a>03048 <span class="comment"> * @brief An internal classification function. It classifies with all trees at </span>
 <a name="l03049"></a>03049 <span class="comment"> *        the same time. For medium/small data sets, tests show that it is more</span>
 <a name="l03050"></a>03050 <span class="comment"> *        efficient than the serial classification function. </span>
 <a name="l03051"></a>03051 <span class="comment"> *</span>
 <a name="l03052"></a>03052 <span class="comment"> * @param classification_table_name  The full name of the table containing the </span>
 <a name="l03053"></a>03053 <span class="comment"> *                                   classification set.</span>
 <a name="l03054"></a>03054 <span class="comment"> * @param tree_table_name            The full name of the tree table.</span>
 <a name="l03055"></a>03055 <span class="comment"> * @param verbosity                  &gt; 0 means this function runs in verbose mode. </span>
 <a name="l03056"></a>03056 <span class="comment"> *</span>
 <a name="l03057"></a>03057 <span class="comment"> * @return An array containing the encoded table name and classification result </span>
 <a name="l03058"></a>03058 <span class="comment"> *         table name (We encode the source table during the classification).</span>
 <a name="l03059"></a>03059 <span class="comment"> *</span>
 <a name="l03060"></a>03060 <span class="comment"> */</span>
 <a name="l03061"></a>03061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
 <a name="l03062"></a>03062     (
 <a name="l03063"></a>03063     classification_table_name   TEXT,
 <a name="l03064"></a>03064     tree_table_name             TEXT,
 <a name="l03065"></a>03065     verbosity                   INT
 <a name="l03066"></a>03066     )
 <a name="l03067"></a>03067 RETURNS TEXT[] AS $$
 <a name="l03068"></a>03068 DECLARE
 <a name="l03069"></a>03069     table_pick              INT    := 1;
 <a name="l03070"></a>03070     remains_to_classify     INT;
 <a name="l03071"></a>03071     size_finished           INT;
 <a name="l03072"></a>03072     time_stamp              TIMESTAMP;
 <a name="l03073"></a>03073     metatable_name          TEXT   := &#39;&#39;;
 <a name="l03074"></a>03074     id_col_name             TEXT   := &#39;id&#39;;
 <a name="l03075"></a>03075     curr_level              INT    := 1;
 <a name="l03076"></a>03076     max_level               INT    := 0;
 <a name="l03077"></a>03077     h2hmv_routine_id        INT    := 0;
 <a name="l03078"></a>03078     curstmt                 TEXT   := &#39;&#39;;
 <a name="l03079"></a>03079     result_table_name       TEXT   := &#39;dt_classify_internal_rt&#39;;
 <a name="l03080"></a>03080     encoded_table_name      TEXT   := &#39;dt_classify_internal_edt&#39;;
 <a name="l03081"></a>03081     table_names             TEXT[] := &#39;{classified_instance_ping,classified_instance_pong}&#39;;
 <a name="l03082"></a>03082     tree_id                 INT;
 <a name="l03083"></a>03083 BEGIN
 <a name="l03084"></a>03084     time_stamp = clock_timestamp();
 <a name="l03085"></a>03085
 <a name="l03086"></a>03086     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03087"></a>03087             (
 <a name="l03088"></a>03088                 (classification_table_name IS NOT NULL) AND
 <a name="l03089"></a>03089                 (
 <a name="l03090"></a>03090                  MADLIB_SCHEMA.__table_exists
 <a name="l03091"></a>03091                     (
 <a name="l03092"></a>03092                         classification_table_name
 <a name="l03093"></a>03093                     )
 <a name="l03094"></a>03094                 ),
 <a name="l03095"></a>03095                 &#39;the specified classification table&#39; ||
 <a name="l03096"></a>03096                 coalesce(&#39;&lt;&#39; || classification_table_name ||
 <a name="l03097"></a>03097                 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
 <a name="l03098"></a>03098             );
 <a name="l03099"></a>03099
 <a name="l03100"></a>03100     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03101"></a>03101             (
 <a name="l03102"></a>03102                 (tree_table_name IS NOT NULL) AND
 <a name="l03103"></a>03103                 (
 <a name="l03104"></a>03104                  MADLIB_SCHEMA.__table_exists
 <a name="l03105"></a>03105                     (
 <a name="l03106"></a>03106                         tree_table_name
 <a name="l03107"></a>03107                     )
 <a name="l03108"></a>03108                 ),
 <a name="l03109"></a>03109                 &#39;the specified tree table&#39; ||
 <a name="l03110"></a>03110                 coalesce(&#39;&lt;&#39; || tree_table_name || &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
 <a name="l03111"></a>03111             );
 <a name="l03112"></a>03112
 <a name="l03113"></a>03113     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03114"></a>03114             (
 <a name="l03115"></a>03115                 verbosity IS NOT NULL,
 <a name="l03116"></a>03116                 &#39;verbosity must be non-null&#39;
 <a name="l03117"></a>03117             );
 <a name="l03118"></a>03118
 <a name="l03119"></a>03119     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE&#39;;
 <a name="l03120"></a>03120
 <a name="l03121"></a>03121     SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
 <a name="l03122"></a>03122
 <a name="l03123"></a>03123     SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
 <a name="l03124"></a>03124
 <a name="l03125"></a>03125     PERFORM MADLIB_SCHEMA.__encode_table
 <a name="l03126"></a>03126         (
 <a name="l03127"></a>03127             classification_table_name,
 <a name="l03128"></a>03128             encoded_table_name,
 <a name="l03129"></a>03129             metatable_name,
 <a name="l03130"></a>03130             h2hmv_routine_id,
 <a name="l03131"></a>03131             verbosity
 <a name="l03132"></a>03132         );
 <a name="l03133"></a>03133
 <a name="l03134"></a>03134     IF (verbosity &gt; 0) THEN
 <a name="l03135"></a>03135         RAISE INFO &#39;tabular format. id_col_name: %&#39;, id_col_name;
 <a name="l03136"></a>03136     END IF;
 <a name="l03137"></a>03137
 <a name="l03138"></a>03138     <span class="comment">/*</span>
 <a name="l03139"></a>03139 <span class="comment">     *  The table of classified_instance_ping and classified_instance_pong are</span>
 <a name="l03140"></a>03140 <span class="comment">     *  auxiliary tables used during the classification process.</span>
 <a name="l03141"></a>03141 <span class="comment">     *  For each record, these tables tell us which node it belongs to. They also</span>
 <a name="l03142"></a>03142 <span class="comment">     *  hold the information of class and probability.</span>
 <a name="l03143"></a>03143 <span class="comment">     *  We use transfer data between these two tables rather than update a single</span>
 <a name="l03144"></a>03144 <span class="comment">     *  table during the classification process. We find the operation of update</span>
 <a name="l03145"></a>03145 <span class="comment">     *  is quite expensive.</span>
 <a name="l03146"></a>03146 <span class="comment">     */</span>
 <a name="l03147"></a>03147     DROP TABLE IF EXISTS classified_instance_ping;
 <a name="l03148"></a>03148     CREATE TEMP TABLE classified_instance_ping
 <a name="l03149"></a>03149     (
 <a name="l03150"></a>03150         tid         INT,
 <a name="l03151"></a>03151         id          BIGINT,
 <a name="l03152"></a>03152         jump        INT,
 <a name="l03153"></a>03153         class       INT,
 <a name="l03154"></a>03154         prob        FLOAT,
 <a name="l03155"></a>03155         parent_id   INT,
 <a name="l03156"></a>03156         leaf_id     INT
 <a name="l03157"></a>03157     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l03158"></a>03158
 <a name="l03159"></a>03159     DROP TABLE IF EXISTS classified_instance_pong;
 <a name="l03160"></a>03160     CREATE TEMP TABLE classified_instance_pong
 <a name="l03161"></a>03161     (
 <a name="l03162"></a>03162         tid         INT,
 <a name="l03163"></a>03163         id          BIGINT,
 <a name="l03164"></a>03164         jump        INT,
 <a name="l03165"></a>03165         class       INT,
 <a name="l03166"></a>03166         prob        FLOAT,
 <a name="l03167"></a>03167         parent_id   INT,
 <a name="l03168"></a>03168         leaf_id     INT
 <a name="l03169"></a>03169     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l03170"></a>03170
 <a name="l03171"></a>03171
 <a name="l03172"></a>03172     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name || &#39; CASCADE&#39;;
 <a name="l03173"></a>03173     EXECUTE &#39;CREATE TEMP TABLE &#39; || result_table_name || E&#39;
 <a name="l03174"></a>03174     (
 <a name="l03175"></a>03175         tid         INT,
 <a name="l03176"></a>03176         id          BIGINT,
 <a name="l03177"></a>03177         jump        INT,
 <a name="l03178"></a>03178         class       INT,
 <a name="l03179"></a>03179         prob        FLOAT,
 <a name="l03180"></a>03180         parent_id   INT,
 <a name="l03181"></a>03181         leaf_id     INT
 <a name="l03182"></a>03182     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
 <a name="l03183"></a>03183
 <a name="l03184"></a>03184
 <a name="l03185"></a>03185     EXECUTE &#39;INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
 <a name="l03186"></a>03186         SELECT m.&#39;||id_col_name||&#39;, t.id, 0, 0, t.tid
 <a name="l03187"></a>03187         FROM &#39; || encoded_table_name || &#39; m CROSS JOIN
 <a name="l03188"></a>03188         (SELECT DISTINCT tid,id FROM &#39;||tree_table_name||&#39; WHERE parent_id=0) t;&#39;;
 <a name="l03189"></a>03189
 <a name="l03190"></a>03190
 <a name="l03191"></a>03191     EXECUTE &#39;SELECT max(array_upper(tree_location,1)) FROM &#39;||tree_table_name||&#39;;&#39;
 <a name="l03192"></a>03192         INTO max_level;
 <a name="l03193"></a>03193
 <a name="l03194"></a>03194     IF( max_level is NULL ) THEN
 <a name="l03195"></a>03195         RAISE EXCEPTION &#39;tree should not be empty&#39;;
 <a name="l03196"></a>03196     END IF;
 <a name="l03197"></a>03197
 <a name="l03198"></a>03198     FOR curr_level IN 1..max_level LOOP
 <a name="l03199"></a>03199         IF (verbosity &gt; 0) THEN
 <a name="l03200"></a>03200             RAISE INFO &#39;new_depth: %&#39;, curr_level;
 <a name="l03201"></a>03201         END IF;
 <a name="l03202"></a>03202
 <a name="l03203"></a>03203         table_pick = table_pick % 2 + 1;
 <a name="l03204"></a>03204
 <a name="l03205"></a>03205         EXECUTE &#39;TRUNCATE &#39;|| table_names[table_pick] ||&#39;;&#39;;
 <a name="l03206"></a>03206         EXECUTE &#39;SELECT count(id) FROM &#39;||result_table_name||&#39;;&#39; INTO size_finished;
 <a name="l03207"></a>03207
 <a name="l03208"></a>03208         IF (verbosity &gt; 0) THEN
 <a name="l03209"></a>03209             RAISE INFO &#39;size_finished %&#39;, size_finished;
 <a name="l03210"></a>03210         END IF;
 <a name="l03211"></a>03211
 <a name="l03212"></a>03212         EXECUTE &#39;SELECT count(*) FROM &#39;|| table_names[(table_pick) % 2 + 1] ||&#39;;&#39;
 <a name="l03213"></a>03213             INTO remains_to_classify;
 <a name="l03214"></a>03214
 <a name="l03215"></a>03215         IF (remains_to_classify = 0) THEN
 <a name="l03216"></a>03216             IF (verbosity &gt; 0) THEN
 <a name="l03217"></a>03217                 RAISE INFO &#39;size_finished: % remains_to_classify: %&#39;,
 <a name="l03218"></a>03218                     size_finished, remains_to_classify;
 <a name="l03219"></a>03219             END IF;
 <a name="l03220"></a>03220
 <a name="l03221"></a>03221             EXIT;
 <a name="l03222"></a>03222         END IF;
 <a name="l03223"></a>03223
 <a name="l03224"></a>03224         SELECT MADLIB_SCHEMA.__format(
 <a name="l03225"></a>03225             &#39;INSERT INTO %
 <a name="l03226"></a>03226             SELECT pt.tid, pt.id,
 <a name="l03227"></a>03227             CASE WHEN (is_cont) THEN
 <a name="l03228"></a>03228                     CASE WHEN (gt.lmc_nid IS NULL) THEN
 <a name="l03229"></a>03229                         0
 <a name="l03230"></a>03230                     ELSE
 <a name="l03231"></a>03231                         gt.lmc_nid +
 <a name="l03232"></a>03232                         float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
 <a name="l03233"></a>03233                         gt.lmc_fval
 <a name="l03234"></a>03234                     END
 <a name="l03235"></a>03235                 ELSE
 <a name="l03236"></a>03236                     CASE WHEN (gt.lmc_nid IS NULL) THEN
 <a name="l03237"></a>03237                         0
 <a name="l03238"></a>03238                     ELSE
 <a name="l03239"></a>03239                         gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
 <a name="l03240"></a>03240                     END
 <a name="l03241"></a>03241                 END as newjump,
 <a name="l03242"></a>03242             gt.max_class, gt.probability, gt.parent_id, gt.id
 <a name="l03243"></a>03243             FROM
 <a name="l03244"></a>03244             (SELECT t1.tid, t1.id, t1.jump, fvals
 <a name="l03245"></a>03245                 FROM % t1
 <a name="l03246"></a>03246                 LEFT JOIN % t2
 <a name="l03247"></a>03247                 ON t1.id = t2.id)  AS pt,
 <a name="l03248"></a>03248             (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
 <a name="l03249"></a>03249                     parent_id, id, is_cont, split_value
 <a name="l03250"></a>03250                 FROM %
 <a name="l03251"></a>03251                 WHERE array_upper(tree_location,1) = %) AS gt
 <a name="l03252"></a>03252             WHERE pt.jump = gt.id AND pt.tid=gt.tid;&#39;,
 <a name="l03253"></a>03253             ARRAY[
 <a name="l03254"></a>03254                 table_names[table_pick],
 <a name="l03255"></a>03255                 table_names[(table_pick) % 2 + 1],
 <a name="l03256"></a>03256                 encoded_table_name,
 <a name="l03257"></a>03257                 tree_table_name,
 <a name="l03258"></a>03258                 MADLIB_SCHEMA.__to_char(curr_level)
 <a name="l03259"></a>03259             ]
 <a name="l03260"></a>03260             )
 <a name="l03261"></a>03261         INTO curstmt;
 <a name="l03262"></a>03262         EXECUTE curstmt;
 <a name="l03263"></a>03263         <span class="comment">/*</span>
 <a name="l03264"></a>03264 <span class="comment">         *  if the node (whose id is &quot;jump&quot;) doesn&#39;t exist, </span>
 <a name="l03265"></a>03265 <span class="comment">         *  then insert them into result table </span>
 <a name="l03266"></a>03266 <span class="comment">         *  (be classified to max_class of its corresponding node)</span>
 <a name="l03267"></a>03267 <span class="comment">         */</span>
 <a name="l03268"></a>03268         FOR tree_id IN EXECUTE &#39;SELECT DISTINCT tid FROM &#39;||tree_table_name LOOP
 <a name="l03269"></a>03269             SELECT MADLIB_SCHEMA.__format(
 <a name="l03270"></a>03270                 &#39;INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
 <a name="l03271"></a>03271                 SELECT tid,id, 0, class, prob, parent_id, leaf_id
 <a name="l03272"></a>03272                 FROM %
 <a name="l03273"></a>03273                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
 <a name="l03274"></a>03274                 AND tid=%&#39;,
 <a name="l03275"></a>03275                 ARRAY[
 <a name="l03276"></a>03276                     result_table_name,
 <a name="l03277"></a>03277                     table_names[table_pick],
 <a name="l03278"></a>03278                     tree_table_name,
 <a name="l03279"></a>03279                     MADLIB_SCHEMA.__to_char(tree_id),
 <a name="l03280"></a>03280                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l03281"></a>03281                 ]
 <a name="l03282"></a>03282                 )
 <a name="l03283"></a>03283             INTO curstmt;
 <a name="l03284"></a>03284             EXECUTE curstmt;
 <a name="l03285"></a>03285
 <a name="l03286"></a>03286             -- delete from the being classified data table
 <a name="l03287"></a>03287             SELECT MADLIB_SCHEMA.__format(
 <a name="l03288"></a>03288                 &#39;DELETE FROM %
 <a name="l03289"></a>03289                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
 <a name="l03290"></a>03290                 AND tid=%&#39;,
 <a name="l03291"></a>03291                 ARRAY[
 <a name="l03292"></a>03292                     table_names[table_pick],
 <a name="l03293"></a>03293                     tree_table_name,
 <a name="l03294"></a>03294                     MADLIB_SCHEMA.__to_char(tree_id),
 <a name="l03295"></a>03295                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l03296"></a>03296                 ]
 <a name="l03297"></a>03297                 )
 <a name="l03298"></a>03298             INTO curstmt;
 <a name="l03299"></a>03299             EXECUTE curstmt;
 <a name="l03300"></a>03300         END LOOP;
 <a name="l03301"></a>03301     END LOOP;
 <a name="l03302"></a>03302
 <a name="l03303"></a>03303     EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT * FROM &#39;||
 <a name="l03304"></a>03304         table_names[table_pick] ||&#39; WHERE jump = 0;&#39;;
 <a name="l03305"></a>03305     EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT * FROM &#39;||
 <a name="l03306"></a>03306         table_names[table_pick % 2 + 1] ||&#39; WHERE jump = 0;&#39;;
 <a name="l03307"></a>03307
 <a name="l03308"></a>03308     IF (verbosity &gt; 0) THEN
 <a name="l03309"></a>03309         RAISE INFO &#39;final classification time:%&#39;, clock_timestamp() - time_stamp;
 <a name="l03310"></a>03310     END IF;
 <a name="l03311"></a>03311
 <a name="l03312"></a>03312     RETURN ARRAY[encoded_table_name, result_table_name];
 <a name="l03313"></a>03313 END
 <a name="l03314"></a>03314 $$ LANGUAGE PLPGSQL;
 <a name="l03315"></a>03315
 <a name="l03316"></a>03316
 <a name="l03317"></a>03317 <span class="comment">/*</span>
 <a name="l03318"></a>03318 <span class="comment"> * @brief An internal classification function. It classifies with one tree</span>
 <a name="l03319"></a>03319 <span class="comment"> *        after another (one pass per tid). For large data sets, tests show</span>
 <a name="l03320"></a>03320 <span class="comment"> *        that it is more efficient than the parallel classification function.</span>
 <a name="l03321"></a>03321 <span class="comment"> *</span>
 <a name="l03322"></a>03322 <span class="comment"> * @param classification_table_name  The full name of the table containing the</span>
 <a name="l03323"></a>03323 <span class="comment"> *                                   classification set. It must exist.</span>
 <a name="l03324"></a>03324 <span class="comment"> * @param tree_table_name            The full name of the tree table. It must exist.</span>
 <a name="l03325"></a>03325 <span class="comment"> * @param verbosity                  &gt; 0 means this function runs in verbose mode.</span>
 <a name="l03326"></a>03326 <span class="comment"> *                                   It must not be NULL.</span>
 <a name="l03327"></a>03327 <span class="comment"> * @return An array containing the encoded table name and classification result</span>
 <a name="l03328"></a>03328 <span class="comment"> *         table name (We encode the source table during the classification).</span>
 <a name="l03329"></a>03329 <span class="comment"> *         The result table (a temp table) is dropped and re-created on every call.</span>
 <a name="l03330"></a>03330 <span class="comment"> */</span>
 <a name="l03331"></a>03331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
 <a name="l03332"></a>03332     (
 <a name="l03333"></a>03333     classification_table_name   TEXT,
 <a name="l03334"></a>03334     tree_table_name             TEXT,
 <a name="l03335"></a>03335     verbosity                   INT
 <a name="l03336"></a>03336     )
 <a name="l03337"></a>03337 RETURNS TEXT[] AS $$
 <a name="l03338"></a>03338 DECLARE
 <a name="l03339"></a>03339     table_pick              INT    := 1;
 <a name="l03340"></a>03340     remains_to_classify     INT;
 <a name="l03341"></a>03341     size_finished           INT;
 <a name="l03342"></a>03342     time_stamp              TIMESTAMP;
 <a name="l03343"></a>03343     metatable_name          TEXT   := &#39;&#39;;
 <a name="l03344"></a>03344     id_col_name             TEXT   := &#39;id&#39;;
 <a name="l03345"></a>03345     curr_level              INT    := 1;  -- NOTE(review): the integer FOR loop below declares its own curr_level; this one looks unused
 <a name="l03346"></a>03346     max_level               INT    := 0;
 <a name="l03347"></a>03347     h2hmv_routine_id        INT    := 0;
 <a name="l03348"></a>03348     curstmt                 TEXT   := &#39;&#39;;
 <a name="l03349"></a>03349     result_table_name       TEXT   := &#39;dt_classify_internal_rt&#39;;
 <a name="l03350"></a>03350     encoded_table_name      TEXT   := &#39;dt_classify_internal_edt&#39;;
 <a name="l03351"></a>03351     table_names             TEXT[] := ARRAY[
 <a name="l03352"></a>03352                                         &#39;classified_instance_ping&#39;,
 <a name="l03353"></a>03353                                         &#39;classified_instance_pong&#39;
 <a name="l03354"></a>03354                                         ];
 <a name="l03355"></a>03355     tree_id                 INT;
 <a name="l03356"></a>03356     root_id                 INT;
 <a name="l03357"></a>03357 BEGIN
 <a name="l03358"></a>03358     time_stamp = clock_timestamp();
 <a name="l03359"></a>03359     -- Validate all three arguments before touching any table.
 <a name="l03360"></a>03360     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03361"></a>03361             (
 <a name="l03362"></a>03362                 (classification_table_name IS NOT NULL) AND
 <a name="l03363"></a>03363                 (
 <a name="l03364"></a>03364                  MADLIB_SCHEMA.__table_exists
 <a name="l03365"></a>03365                     (
 <a name="l03366"></a>03366                         classification_table_name
 <a name="l03367"></a>03367                     )
 <a name="l03368"></a>03368                 ),
 <a name="l03369"></a>03369                 &#39;the specified classification table&#39; ||
 <a name="l03370"></a>03370                 coalesce(&#39;&lt;&#39;                         ||
 <a name="l03371"></a>03371                 classification_table_name            ||
 <a name="l03372"></a>03372                 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
 <a name="l03373"></a>03373             );
 <a name="l03374"></a>03374
 <a name="l03375"></a>03375     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03376"></a>03376             (
 <a name="l03377"></a>03377                 (tree_table_name IS NOT NULL) AND
 <a name="l03378"></a>03378                 (
 <a name="l03379"></a>03379                  MADLIB_SCHEMA.__table_exists
 <a name="l03380"></a>03380                     (
 <a name="l03381"></a>03381                         tree_table_name
 <a name="l03382"></a>03382                     )
 <a name="l03383"></a>03383                 ),
 <a name="l03384"></a>03384                 &#39;the specified tree table&#39;  ||
 <a name="l03385"></a>03385                 coalesce(&#39;&lt;&#39;                ||
 <a name="l03386"></a>03386                 tree_table_name             ||
 <a name="l03387"></a>03387                 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
 <a name="l03388"></a>03388             );
 <a name="l03389"></a>03389
 <a name="l03390"></a>03390
 <a name="l03391"></a>03391     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03392"></a>03392             (
 <a name="l03393"></a>03393                 verbosity IS NOT NULL,
 <a name="l03394"></a>03394                 &#39;verbosity must be non-null&#39;
 <a name="l03395"></a>03395             );
 <a name="l03396"></a>03396     -- Drop any existing table that carries the encoded table name.
 <a name="l03397"></a>03397     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE&#39;;
 <a name="l03398"></a>03398
 <a name="l03399"></a>03399     metatable_name   = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
 <a name="l03400"></a>03400
 <a name="l03401"></a>03401     h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
 <a name="l03402"></a>03402     -- NOTE(review): __encode_table presumably builds encoded_table_name from the input table; confirm against its definition.
 <a name="l03403"></a>03403     PERFORM MADLIB_SCHEMA.__encode_table
 <a name="l03404"></a>03404         (
 <a name="l03405"></a>03405             classification_table_name,
 <a name="l03406"></a>03406             encoded_table_name,
 <a name="l03407"></a>03407             metatable_name,
 <a name="l03408"></a>03408             h2hmv_routine_id,
 <a name="l03409"></a>03409             verbosity
 <a name="l03410"></a>03410         );
 <a name="l03411"></a>03411
 <a name="l03412"></a>03412     IF (verbosity &gt; 0) THEN
 <a name="l03413"></a>03413         RAISE INFO &#39;tabular format. id_col_name: %&#39;, id_col_name;
 <a name="l03414"></a>03414     END IF;
 <a name="l03415"></a>03415
 <a name="l03416"></a>03416     <span class="comment">/*</span>
 <a name="l03417"></a>03417 <span class="comment">     *  classified_instance_ping and classified_instance_pong are auxiliary</span>
 <a name="l03418"></a>03418 <span class="comment">     *  tables used during the classification process.</span>
 <a name="l03419"></a>03419 <span class="comment">     *  For each record, they hold the node it currently belongs to (jump) as</span>
 <a name="l03420"></a>03420 <span class="comment">     *  well as its class and probability.</span>
 <a name="l03421"></a>03421 <span class="comment">     *  We transfer data back and forth between these two tables rather than</span>
 <a name="l03422"></a>03422 <span class="comment">     *  updating a single table during the classification process, because we</span>
 <a name="l03423"></a>03423 <span class="comment">     *  found the UPDATE operation to be quite expensive.</span>
 <a name="l03424"></a>03424 <span class="comment">     */</span>
 <a name="l03425"></a>03425     DROP TABLE IF EXISTS classified_instance_ping;
 <a name="l03426"></a>03426     CREATE TEMP TABLE classified_instance_ping
 <a name="l03427"></a>03427     (
 <a name="l03428"></a>03428         id          BIGINT,
 <a name="l03429"></a>03429         jump        INT,
 <a name="l03430"></a>03430         class       INT,
 <a name="l03431"></a>03431         prob        FLOAT,
 <a name="l03432"></a>03432         parent_id   INT,
 <a name="l03433"></a>03433         leaf_id     INT
 <a name="l03434"></a>03434     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l03435"></a>03435
 <a name="l03436"></a>03436     DROP TABLE IF EXISTS classified_instance_pong;
 <a name="l03437"></a>03437     CREATE TEMP TABLE classified_instance_pong
 <a name="l03438"></a>03438     (
 <a name="l03439"></a>03439         id          BIGINT,
 <a name="l03440"></a>03440         jump        INT,
 <a name="l03441"></a>03441         class       INT,
 <a name="l03442"></a>03442         prob        FLOAT,
 <a name="l03443"></a>03443         parent_id   INT,
 <a name="l03444"></a>03444         leaf_id     INT
 <a name="l03445"></a>03445     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
 <a name="l03446"></a>03446
 <a name="l03447"></a>03447     -- Result table: rows from every tree are collected here, tagged with tid.
 <a name="l03448"></a>03448     EXECUTE &#39;DROP TABLE IF EXISTS &#39;||result_table_name || &#39; CASCADE&#39;;
 <a name="l03449"></a>03449     EXECUTE &#39;CREATE TEMP TABLE &#39; || result_table_name || E&#39;
 <a name="l03450"></a>03450     (
 <a name="l03451"></a>03451         tid         INT,
 <a name="l03452"></a>03452         id          BIGINT,
 <a name="l03453"></a>03453         jump        INT,
 <a name="l03454"></a>03454         class       INT,
 <a name="l03455"></a>03455         prob        FLOAT,
 <a name="l03456"></a>03456         parent_id   INT,
 <a name="l03457"></a>03457         leaf_id     INT
 <a name="l03458"></a>03458     ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
 <a name="l03459"></a>03459     -- Outer loop: one pass per distinct tree id (tid) found in the tree table.
 <a name="l03460"></a>03460     FOR tree_id IN EXECUTE &#39;SELECT DISTINCT tid FROM &#39;||tree_table_name LOOP
 <a name="l03461"></a>03461         EXECUTE &#39;SELECT max(array_upper(tree_location,1)) FROM &#39;||
 <a name="l03462"></a>03462             tree_table_name||&#39; WHERE tid=&#39;||tree_id||&#39;;&#39;  INTO max_level;
 <a name="l03463"></a>03463         IF (verbosity &gt; 0) THEN
 <a name="l03464"></a>03464             RAISE INFO &#39;tree_id: %, max_level: %&#39;, tree_id,max_level;
 <a name="l03465"></a>03465         END IF;
 <a name="l03466"></a>03466
 <a name="l03467"></a>03467
 <a name="l03468"></a>03468         IF( max_level is NULL ) THEN
 <a name="l03469"></a>03469             RAISE EXCEPTION &#39;tree should not be empty&#39;;
 <a name="l03470"></a>03470         END IF;
 <a name="l03471"></a>03471         -- Empty both work tables before starting on this tree.
 <a name="l03472"></a>03472         TRUNCATE classified_instance_ping;
 <a name="l03473"></a>03473         TRUNCATE classified_instance_pong;
 <a name="l03474"></a>03474         -- Seed ping: every encoded row starts at the root of this tree (parent_id = 0). NOTE(review): a NULL root_id would make the INSERT statement NULL.
 <a name="l03475"></a>03475         EXECUTE &#39;SELECT id FROM &#39;||tree_table_name||
 <a name="l03476"></a>03476             &#39; WHERE parent_id=0 and tid=&#39;||tree_id||&#39;;&#39; INTO root_id;
 <a name="l03477"></a>03477         EXECUTE &#39;INSERT INTO classified_instance_ping (id, jump, class, prob)
 <a name="l03478"></a>03478                  SELECT &#39;||id_col_name||&#39;, &#39;||root_id||&#39;, 0, 0 FROM &#39; ||
 <a name="l03479"></a>03479                  encoded_table_name || &#39;;&#39;;
 <a name="l03480"></a>03480         table_pick= 1;
 <a name="l03481"></a>03481         FOR curr_level IN 1..max_level LOOP
 <a name="l03482"></a>03482             IF (verbosity &gt; 0) THEN
 <a name="l03483"></a>03483                 RAISE INFO &#39;new_depth: %&#39;, curr_level;
 <a name="l03484"></a>03484             END IF;
 <a name="l03485"></a>03485             -- Flip roles: table_names[table_pick] becomes the destination, the other table the source.
 <a name="l03486"></a>03486             table_pick = table_pick % 2 + 1;
 <a name="l03487"></a>03487
 <a name="l03488"></a>03488             EXECUTE &#39;TRUNCATE &#39;|| table_names[table_pick] ||&#39;;&#39;;
 <a name="l03489"></a>03489             EXECUTE &#39;SELECT count(id) FROM &#39;||result_table_name||&#39;;&#39;
 <a name="l03490"></a>03490                      INTO size_finished;
 <a name="l03491"></a>03491
 <a name="l03492"></a>03492             IF (verbosity &gt; 0) THEN
 <a name="l03493"></a>03493                 RAISE INFO &#39;size_finished %&#39;, size_finished;
 <a name="l03494"></a>03494             END IF;
 <a name="l03495"></a>03495             -- Leave the level loop as soon as the source table holds no rows.
 <a name="l03496"></a>03496             EXECUTE &#39;SELECT count(*) FROM &#39;||
 <a name="l03497"></a>03497                      table_names[(table_pick) % 2 + 1] ||&#39;;&#39;
 <a name="l03498"></a>03498                      INTO remains_to_classify;
 <a name="l03499"></a>03499
 <a name="l03500"></a>03500             IF (remains_to_classify = 0) THEN
 <a name="l03501"></a>03501                 IF (verbosity &gt; 0) THEN
 <a name="l03502"></a>03502                     RAISE INFO &#39;size_finished: % remains_to_classify: %&#39;,
 <a name="l03503"></a>03503                         size_finished, remains_to_classify;
 <a name="l03504"></a>03504                 END IF;
 <a name="l03505"></a>03505
 <a name="l03506"></a>03506                 EXIT;
 <a name="l03507"></a>03507             END IF;
 <a name="l03508"></a>03508             -- Fill the destination: join each source row with the node it sits on at this level and compute its next jump (0 when lmc_nid IS NULL).
 <a name="l03509"></a>03509             SELECT MADLIB_SCHEMA.__format(
 <a name="l03510"></a>03510                 &#39;INSERT INTO %
 <a name="l03511"></a>03511                 SELECT pt.id,
 <a name="l03512"></a>03512                 CASE WHEN (is_cont) THEN
 <a name="l03513"></a>03513                         CASE WHEN (gt.lmc_nid IS NULL) THEN
 <a name="l03514"></a>03514                             0
 <a name="l03515"></a>03515                         ELSE
 <a name="l03516"></a>03516                             gt.lmc_nid +
 <a name="l03517"></a>03517                             float8lt(gt.split_value, fvals[gt.feature])::INT4
 <a name="l03518"></a>03518                             + 1 - gt.lmc_fval
 <a name="l03519"></a>03519                         END
 <a name="l03520"></a>03520                     ELSE
 <a name="l03521"></a>03521                         CASE WHEN (gt.lmc_nid IS NULL) THEN
 <a name="l03522"></a>03522                             0
 <a name="l03523"></a>03523                         ELSE
 <a name="l03524"></a>03524                             gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
 <a name="l03525"></a>03525                         END
 <a name="l03526"></a>03526                     END as newjump,
 <a name="l03527"></a>03527                 gt.max_class, gt.probability, gt.parent_id, gt.id
 <a name="l03528"></a>03528                 FROM
 <a name="l03529"></a>03529                 (
 <a name="l03530"></a>03530                     SELECT t1.id, t1.jump, fvals
 <a name="l03531"></a>03531                     FROM % t1
 <a name="l03532"></a>03532                     LEFT JOIN % t2
 <a name="l03533"></a>03533                     ON t1.id = t2.id
 <a name="l03534"></a>03534                 ) AS pt,
 <a name="l03535"></a>03535                 (
 <a name="l03536"></a>03536                     SELECT  lmc_nid, lmc_fval, max_class, feature, probability,
 <a name="l03537"></a>03537                             parent_id, id, is_cont, split_value
 <a name="l03538"></a>03538                     FROM %
 <a name="l03539"></a>03539                     WHERE array_upper(tree_location,1) = % AND tid=%
 <a name="l03540"></a>03540                 ) AS gt
 <a name="l03541"></a>03541                 WHERE pt.jump = gt.id;&#39;,
 <a name="l03542"></a>03542                 ARRAY[
 <a name="l03543"></a>03543                     table_names[table_pick],
 <a name="l03544"></a>03544                     table_names[(table_pick) % 2 + 1],
 <a name="l03545"></a>03545                     encoded_table_name,
 <a name="l03546"></a>03546                     tree_table_name,
 <a name="l03547"></a>03547                     MADLIB_SCHEMA.__to_char(curr_level),
 <a name="l03548"></a>03548                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l03549"></a>03549                 ]
 <a name="l03550"></a>03550                 )
 <a name="l03551"></a>03551             INTO curstmt;
 <a name="l03552"></a>03552             EXECUTE curstmt;
 <a name="l03553"></a>03553
 <a name="l03554"></a>03554             <span class="comment">/*</span>
 <a name="l03555"></a>03555 <span class="comment">             *  If a row&#39;s &quot;jump&quot; is not the id of any node of this tree,</span>
 <a name="l03556"></a>03556 <span class="comment">             *  the row is finished: insert it into the result table</span>
 <a name="l03557"></a>03557 <span class="comment">             *  (classified to the max_class of its corresponding node).</span>
 <a name="l03558"></a>03558 <span class="comment">             */</span>
 <a name="l03559"></a>03559             SELECT MADLIB_SCHEMA.__format(
 <a name="l03560"></a>03560                 &#39;INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
 <a name="l03561"></a>03561                  SELECT &#39;||tree_id||&#39;,id, 0, class, prob, parent_id, leaf_id
 <a name="l03562"></a>03562                  FROM %
 <a name="l03563"></a>03563                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)&#39;,
 <a name="l03564"></a>03564                 ARRAY[
 <a name="l03565"></a>03565                     result_table_name,
 <a name="l03566"></a>03566                     table_names[table_pick],
 <a name="l03567"></a>03567                     tree_table_name,
 <a name="l03568"></a>03568                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l03569"></a>03569                     ]
 <a name="l03570"></a>03570                 )
 <a name="l03571"></a>03571             INTO curstmt;
 <a name="l03572"></a>03572             EXECUTE curstmt;
 <a name="l03573"></a>03573
 <a name="l03574"></a>03574             -- then delete those same rows from the table being classified
 <a name="l03575"></a>03575             SELECT MADLIB_SCHEMA.__format(
 <a name="l03576"></a>03576                 &#39;DELETE FROM %
 <a name="l03577"></a>03577                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)&#39;,
 <a name="l03578"></a>03578                 ARRAY[
 <a name="l03579"></a>03579                     table_names[table_pick],
 <a name="l03580"></a>03580                     tree_table_name,
 <a name="l03581"></a>03581                     MADLIB_SCHEMA.__to_char(tree_id)
 <a name="l03582"></a>03582                     ]
 <a name="l03583"></a>03583                 )
 <a name="l03584"></a>03584             INTO curstmt;
 <a name="l03585"></a>03585             EXECUTE curstmt;
 <a name="l03586"></a>03586         END LOOP;
 <a name="l03587"></a>03587         -- Copy the rows with jump = 0 from both work tables into the result table, prefixed with this tree id.
 <a name="l03588"></a>03588         EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT &#39;||tree_id||&#39;,* FROM &#39;||
 <a name="l03589"></a>03589             table_names[table_pick] ||&#39; WHERE jump = 0;&#39;;
 <a name="l03590"></a>03590         EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT &#39;||tree_id||&#39;,* FROM &#39;||
 <a name="l03591"></a>03591             table_names[table_pick % 2 + 1] ||&#39; WHERE jump = 0;&#39;;
 <a name="l03592"></a>03592     END LOOP;
 <a name="l03593"></a>03593
 <a name="l03594"></a>03594     IF (verbosity &gt; 0) THEN
 <a name="l03595"></a>03595         RAISE INFO &#39;final classification time:%&#39;, clock_timestamp() - time_stamp;
 <a name="l03596"></a>03596     END IF;
 <a name="l03597"></a>03597
 <a name="l03598"></a>03598     RETURN ARRAY[encoded_table_name, result_table_name];
 <a name="l03599"></a>03599 END
 <a name="l03600"></a>03600 $$ LANGUAGE PLPGSQL;
 <a name="l03601"></a>03601
 <a name="l03602"></a>03602
 <a name="l03603"></a>03603 <span class="comment">/*</span>
 <a name="l03604"></a>03604 <span class="comment"> * @brief This function checks the accuracy of the trained tree model.</span>
 <a name="l03605"></a>03605 <span class="comment"> * </span>
 <a name="l03606"></a>03606 <span class="comment"> * @param tree_table_name     The name of the tree containing the model.</span>
 <a name="l03607"></a>03607 <span class="comment"> * @param scoring_table_name  The full name of the table/view with the </span>
 <a name="l03608"></a>03608 <span class="comment"> *                            data to be scored.</span>
 <a name="l03609"></a>03609 <span class="comment"> * @param verbosity           &gt; 0 means this function runs in verbose mode.</span>
 <a name="l03610"></a>03610 <span class="comment"> *</span>
 <a name="l03611"></a>03611 <span class="comment"> * @return The estimated accuracy information.</span>
 <a name="l03612"></a>03612 <span class="comment"> *</span>
 <a name="l03613"></a>03613 <span class="comment"> */</span>
 <a name="l03614"></a>03614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
 <a name="l03615"></a>03615     (
 <a name="l03616"></a>03616     tree_table_name             TEXT,
 <a name="l03617"></a>03617     scoring_table_name          TEXT,
 <a name="l03618"></a>03618     verbosity                   INT
 <a name="l03619"></a>03619     )
 <a name="l03620"></a>03620 RETURNS FLOAT AS $$
 <a name="l03621"></a>03621 DECLARE
 <a name="l03622"></a>03622     result_table_name           TEXT;
 <a name="l03623"></a>03623     result_table_name_final TEXT;
 <a name="l03624"></a>03624     id_col_name             TEXT  := &#39;id&#39;;
 <a name="l03625"></a>03625     class_col_name          TEXT  := &#39;class&#39;;
 <a name="l03626"></a>03626     curstmt                 TEXT  := &#39;&#39;;
 <a name="l03627"></a>03627     num_of_row              FLOAT := 0.0;
 <a name="l03628"></a>03628     mis_of_row              FLOAT := 0.0;
 <a name="l03629"></a>03629     encoded_table_name      TEXT  := &#39;&#39;;
 <a name="l03630"></a>03630     table_names                 TEXT[];
 <a name="l03631"></a>03631 BEGIN
 <a name="l03632"></a>03632
 <a name="l03633"></a>03633     IF (verbosity &gt; 0) THEN
 <a name="l03634"></a>03634         -- get rid of the messages whose severity level is lower than &#39;WARNING&#39;
 <a name="l03635"></a>03635         SET client_min_messages = WARNING;
 <a name="l03636"></a>03636     END IF;
 <a name="l03637"></a>03637
 <a name="l03638"></a>03638     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03639"></a>03639             (
 <a name="l03640"></a>03640                 (tree_table_name IS NOT NULL) AND
 <a name="l03641"></a>03641                 (
 <a name="l03642"></a>03642                  MADLIB_SCHEMA.__table_exists
 <a name="l03643"></a>03643                     (
 <a name="l03644"></a>03644                         tree_table_name
 <a name="l03645"></a>03645                     )
 <a name="l03646"></a>03646                 ),
 <a name="l03647"></a>03647                 &#39;the specified tree table&#39; || coalesce(&#39;&lt;&#39; || tree_table_name
 <a name="l03648"></a>03648                 || &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
 <a name="l03649"></a>03649             );
 <a name="l03650"></a>03650
 <a name="l03651"></a>03651     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03652"></a>03652             (
 <a name="l03653"></a>03653                 (scoring_table_name IS NOT NULL) AND
 <a name="l03654"></a>03654                 (
 <a name="l03655"></a>03655                  MADLIB_SCHEMA.__table_exists
 <a name="l03656"></a>03656                     (
 <a name="l03657"></a>03657                         scoring_table_name
 <a name="l03658"></a>03658                     )
 <a name="l03659"></a>03659                 ),
 <a name="l03660"></a>03660                 &#39;the specified scoring table&#39;      ||
 <a name="l03661"></a>03661                 coalesce(&#39;&lt;&#39; || scoring_table_name ||
 <a name="l03662"></a>03662                 &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
 <a name="l03663"></a>03663             );
 <a name="l03664"></a>03664
 <a name="l03665"></a>03665     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03666"></a>03666         (
 <a name="l03667"></a>03667             MADLIB_SCHEMA.__column_exists
 <a name="l03668"></a>03668                 (
 <a name="l03669"></a>03669                     scoring_table_name,
 <a name="l03670"></a>03670                     MADLIB_SCHEMA.__get_class_column_name
 <a name="l03671"></a>03671                         (
 <a name="l03672"></a>03672                         MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
 <a name="l03673"></a>03673                         )
 <a name="l03674"></a>03674                 ),
 <a name="l03675"></a>03675             &#39;the specified scoring table&lt;&#39; || scoring_table_name ||
 <a name="l03676"></a>03676             &#39;&gt; does not have class column&#39;
 <a name="l03677"></a>03677         );
 <a name="l03678"></a>03678
 <a name="l03679"></a>03679     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
 <a name="l03680"></a>03680                     (
 <a name="l03681"></a>03681                         scoring_table_name,
 <a name="l03682"></a>03682                         tree_table_name,
 <a name="l03683"></a>03683                         verbosity
 <a name="l03684"></a>03684                     );
 <a name="l03685"></a>03685     encoded_table_name      = table_names[1];
 <a name="l03686"></a>03686     result_table_name       = table_names[2];
 <a name="l03687"></a>03687     result_table_name_final = result_table_name||&#39;_final&#39;;
 <a name="l03688"></a>03688
 <a name="l03689"></a>03689     PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
 <a name="l03690"></a>03690         (
 <a name="l03691"></a>03691         result_table_name,
 <a name="l03692"></a>03692         result_table_name_final
 <a name="l03693"></a>03693         );
 <a name="l03694"></a>03694
 <a name="l03695"></a>03695     SELECT MADLIB_SCHEMA.__format
 <a name="l03696"></a>03696         (
 <a name="l03697"></a>03697         &#39;SELECT count(id) FROM %;&#39;,
 <a name="l03698"></a>03698         result_table_name_final
 <a name="l03699"></a>03699         )
 <a name="l03700"></a>03700     INTO curstmt;
 <a name="l03701"></a>03701
 <a name="l03702"></a>03702     EXECUTE curstmt INTO num_of_row;
 <a name="l03703"></a>03703
 <a name="l03704"></a>03704     SELECT MADLIB_SCHEMA.__format
 <a name="l03705"></a>03705         (
 <a name="l03706"></a>03706         &#39;SELECT count(t2.id)
 <a name="l03707"></a>03707      FROM % t1, % t2
 <a name="l03708"></a>03708          WHERE t1.% = t2.id AND t1.% &lt;&gt; t2.class&#39;,
 <a name="l03709"></a>03709         ARRAY[
 <a name="l03710"></a>03710             encoded_table_name,
 <a name="l03711"></a>03711             result_table_name_final,
 <a name="l03712"></a>03712             id_col_name,
 <a name="l03713"></a>03713             class_col_name
 <a name="l03714"></a>03714         ]
 <a name="l03715"></a>03715         )
 <a name="l03716"></a>03716     INTO curstmt;
 <a name="l03717"></a>03717
 <a name="l03718"></a>03718     EXECUTE curstmt INTO mis_of_row;
 <a name="l03719"></a>03719
 <a name="l03720"></a>03720     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39;;&#39;;
 <a name="l03721"></a>03721     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name || &#39;;&#39;;
 <a name="l03722"></a>03722     EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name_final || &#39;;&#39;;
 <a name="l03723"></a>03723     RETURN (num_of_row - mis_of_row) / num_of_row;
 <a name="l03724"></a>03724 END;
 <a name="l03725"></a>03725 $$ LANGUAGE PLPGSQL;
 <a name="l03726"></a>03726
 <a name="l03727"></a>03727
 <a name="l03728"></a>03728 <span class="comment">/*</span>
 <a name="l03729"></a>03729 <span class="comment"> * @brief Clean up the trained model table and any related tables.</span>
 <a name="l03730"></a>03730 <span class="comment"> *</span>
 <a name="l03731"></a>03731 <span class="comment"> * @param model_table_name The name of the table containing</span>
 <a name="l03732"></a>03732 <span class="comment"> *                         the model&#39;s information.</span>
 <a name="l03733"></a>03733 <span class="comment"> *</span>
 <a name="l03734"></a>03734 <span class="comment"> * @return The status of that cleanup operation.</span>
 <a name="l03735"></a>03735 <span class="comment"> *</span>
 <a name="l03736"></a>03736 <span class="comment"> */</span>
 <a name="l03737"></a>03737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
 <a name="l03738"></a>03738     (
 <a name="l03739"></a>03739     model_table_name TEXT
 <a name="l03740"></a>03740     )
 <a name="l03741"></a>03741 RETURNS BOOLEAN AS $$
 <a name="l03742"></a>03742 DECLARE
 <a name="l03743"></a>03743     metatable_name TEXT;
 <a name="l03744"></a>03744     ref_count      INT;
 <a name="l03745"></a>03745 BEGIN
 <a name="l03746"></a>03746     -- get rid of the messages whose severity level is lower than &#39;WARNING&#39;
 <a name="l03747"></a>03747     SET client_min_messages = WARNING;
 <a name="l03748"></a>03748
 <a name="l03749"></a>03749     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03750"></a>03750             (
 <a name="l03751"></a>03751                 (model_table_name IS NOT NULL) AND
 <a name="l03752"></a>03752                 (
 <a name="l03753"></a>03753                  MADLIB_SCHEMA.__table_exists
 <a name="l03754"></a>03754                     (
 <a name="l03755"></a>03755                         model_table_name
 <a name="l03756"></a>03756                     )
 <a name="l03757"></a>03757                 ),
 <a name="l03758"></a>03758                 &#39;the specified tree table&#39;      ||
 <a name="l03759"></a>03759                 coalesce(&#39;&lt;&#39;                    ||
 <a name="l03760"></a>03760                 model_table_name                ||
 <a name="l03761"></a>03761                 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
 <a name="l03762"></a>03762             );
 <a name="l03763"></a>03763
 <a name="l03764"></a>03764     IF (MADLIB_SCHEMA.__table_exists(&#39;MADLIB_SCHEMA.training_info&#39;)) THEN
 <a name="l03765"></a>03765         metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
 <a name="l03766"></a>03766         IF( metatable_name IS NOT NULL) THEN
 <a name="l03767"></a>03767             SELECT count(*)
 <a name="l03768"></a>03768             FROM MADLIB_SCHEMA.training_info
 <a name="l03769"></a>03769             WHERE training_metatable_oid = metatable_name::regclass
 <a name="l03770"></a>03770             INTO ref_count;
 <a name="l03771"></a>03771
 <a name="l03772"></a>03772             -- if the metatable is not referenced by other training procedure.
 <a name="l03773"></a>03773             IF (ref_count = 1) THEN
 <a name="l03774"></a>03774                 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
 <a name="l03775"></a>03775                 EXECUTE &#39;DROP TABLE IF EXISTS &#39; ||
 <a name="l03776"></a>03776                      MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || &#39;;&#39;;
 <a name="l03777"></a>03777             END IF;
 <a name="l03778"></a>03778         END IF;
 <a name="l03779"></a>03779
 <a name="l03780"></a>03780         -- remove the record first, and then drop the table
 <a name="l03781"></a>03781         PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
 <a name="l03782"></a>03782         EXECUTE &#39;DROP TABLE IF EXISTS &#39; || model_table_name;
 <a name="l03783"></a>03783
 <a name="l03784"></a>03784     ELSE
 <a name="l03785"></a>03785         EXECUTE &#39;DROP TABLE IF EXISTS &#39; || model_table_name;
 <a name="l03786"></a>03786     END IF;
 <a name="l03787"></a>03787
 <a name="l03788"></a>03788     RETURN &#39;t&#39;;
 <a name="l03789"></a>03789 END
 <a name="l03790"></a>03790 $$ LANGUAGE PLPGSQL;
 <a name="l03791"></a>03791
 <a name="l03792"></a>03792 <span class="comment">/*</span>
 <a name="l03793"></a>03793 <span class="comment"> * @brief Validate the common parameters for C4.5 and RF API.</span>
 <a name="l03794"></a>03794 <span class="comment"> *</span>
 <a name="l03795"></a>03795 <span class="comment"> * @param split_criterion           The name of the split criterion that should be used </span>
 <a name="l03796"></a>03796 <span class="comment"> *                                  for tree construction. The valid values are</span>
 <a name="l03797"></a>03797 <span class="comment"> *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can&#39;t be NULL.</span>
 <a name="l03798"></a>03798 <span class="comment"> * @param training_table_name       The name of the table/view with the source data.</span>
 <a name="l03799"></a>03799 <span class="comment"> * @param result_table_name         The name of the table where the resulting DT </span>
 <a name="l03800"></a>03800 <span class="comment"> *                                  will be kept.</span>
 <a name="l03801"></a>03801 <span class="comment"> * @param continuous_feature_names  A comma-separated list of the names of features whose values </span>
 <a name="l03802"></a>03802 <span class="comment"> *                                  are continuous. The default is null, which means there are </span>
 <a name="l03803"></a>03803 <span class="comment"> *                                  no continuous features in the training table.</span>
 <a name="l03804"></a>03804 <span class="comment"> * @param feature_col_names         A comma-separated list of the names of table columns, each of</span>
 <a name="l03805"></a>03805 <span class="comment"> *                                  which defines a feature. The default value is null, which means </span>
 <a name="l03806"></a>03806 <span class="comment"> *                                  all the columns in the training table, except columns named </span>
 <a name="l03807"></a>03807 <span class="comment"> *                                   ‘id’ and ‘class’, will be used as features.</span>
 <a name="l03808"></a>03808 <span class="comment"> * @param id_col_name               The name of the column containing an ID for each record.</span>
 <a name="l03809"></a>03809 <span class="comment"> * @param class_col_name            The name of the column containing the labeled class. </span>
 <a name="l03810"></a>03810 <span class="comment"> * @param how2handle_missing_value  How to handle missing values. The valid values </span>
 <a name="l03811"></a>03811 <span class="comment"> *                                  are &#39;explicit&#39; and &#39;ignore&#39;.</span>
 <a name="l03812"></a>03812 <span class="comment"> * @param max_tree_depth            Specifies the maximum number of levels in the result DT </span>
 <a name="l03813"></a>03813 <span class="comment"> *                                  to avoid overgrown DTs. </span>
 <a name="l03814"></a>03814 <span class="comment"> * @param node_prune_threshold      The minimum percentage of the number of records required in a</span>
 <a name="l03815"></a>03815 <span class="comment"> *                                  child node. It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
 <a name="l03816"></a>03816 <span class="comment"> *                                  This threshold only applies to the non-root nodes. Therefore,</span>
 <a name="l03817"></a>03817 <span class="comment"> *                                  if its value is 1, then the trained tree only has one node (the root node);</span>
 <a name="l03818"></a>03818 <span class="comment"> *                                  if its value is 0, then no nodes will be pruned by this parameter.</span>
 <a name="l03819"></a>03819 <span class="comment"> * @param node_split_threshold      The minimum percentage of the number of records required in a</span>
 <a name="l03820"></a>03820 <span class="comment"> *                                  node in order for a further split to be possible.</span>
 <a name="l03821"></a>03821 <span class="comment"> *                                  It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
 <a name="l03822"></a>03822 <span class="comment"> *                                  If its value is 1, then the trained tree only has two levels, since</span>
 <a name="l03823"></a>03823 <span class="comment"> *                                  only the root node can grow; if its value is 0, then trees can grow</span>
 <a name="l03824"></a>03824 <span class="comment"> *                                  extensively.</span>
 <a name="l03825"></a>03825 <span class="comment"> * @param verbosity                 &gt; 0 means this function runs in verbose mode.   </span>
 <a name="l03826"></a>03826 <span class="comment"> * @param error_msg                 Text inserted into the error message reported when result_table_name is invalid.</span>
 <a name="l03827"></a>03827 <span class="comment"> *</span>
 <a name="l03828"></a>03828 <span class="comment"> */</span>
 <a name="l03829"></a>03829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
 <a name="l03830"></a>03830     (
 <a name="l03831"></a>03831     split_criterion             TEXT,
 <a name="l03832"></a>03832     training_table_name         TEXT,
 <a name="l03833"></a>03833     result_table_name           TEXT,
 <a name="l03834"></a>03834     continuous_feature_names    TEXT,
 <a name="l03835"></a>03835     feature_col_names           TEXT,
 <a name="l03836"></a>03836     id_col_name                 TEXT,
 <a name="l03837"></a>03837     class_col_name              TEXT,
 <a name="l03838"></a>03838     how2handle_missing_value    TEXT,
 <a name="l03839"></a>03839     max_tree_depth              INT,
 <a name="l03840"></a>03840     node_prune_threshold        FLOAT,
 <a name="l03841"></a>03841     node_split_threshold        FLOAT,
 <a name="l03842"></a>03842     verbosity                   INT,
 <a name="l03843"></a>03843     error_msg                   TEXT
 <a name="l03844"></a>03844     )
 <a name="l03845"></a>03845 RETURNS void AS $$
 <a name="l03846"></a>03846 DECLARE
 <a name="l03847"></a>03847     num_of_element  BIGINT;
 <a name="l03848"></a>03848 BEGIN
 <a name="l03849"></a>03849     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03850"></a>03850         (
 <a name="l03851"></a>03851             (split_criterion IS NOT NULL)   AND
 <a name="l03852"></a>03852             (
 <a name="l03853"></a>03853              split_criterion = &#39;infogain&#39;   OR
 <a name="l03854"></a>03854              split_criterion = &#39;gainratio&#39;  OR
 <a name="l03855"></a>03855              split_criterion = &#39;gini&#39;
 <a name="l03856"></a>03856             ),
 <a name="l03857"></a>03857             &#39;split_criterion must be infogain, gainratio or gini&#39;
 <a name="l03858"></a>03858         );
 <a name="l03859"></a>03859
 <a name="l03860"></a>03860     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03861"></a>03861         (
 <a name="l03862"></a>03862             how2handle_missing_value = &#39;ignore&#39; OR
 <a name="l03863"></a>03863             how2handle_missing_value = &#39;explicit&#39;,
 <a name="l03864"></a>03864             &#39;how2handle_missing_value must be ignore or explicit!&#39;
 <a name="l03865"></a>03865         );
 <a name="l03866"></a>03866
 <a name="l03867"></a>03867     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03868"></a>03868         (
 <a name="l03869"></a>03869             max_tree_depth IS NOT NULL    AND
 <a name="l03870"></a>03870             max_tree_depth &gt; 0,
 <a name="l03871"></a>03871             &#39;max_tree_depth value must be greater than 0&#39;
 <a name="l03872"></a>03872         );
 <a name="l03873"></a>03873
 <a name="l03874"></a>03874     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03875"></a>03875         (
 <a name="l03876"></a>03876             node_prune_threshold IS NOT NULL    AND
 <a name="l03877"></a>03877             float8ge(node_prune_threshold, 0)   AND
 <a name="l03878"></a>03878             float8le(node_prune_threshold, 1),
 <a name="l03879"></a>03879             &#39;node_prune_threshold value must be in range from 0 to 1&#39;
 <a name="l03880"></a>03880         );
 <a name="l03881"></a>03881
 <a name="l03882"></a>03882     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03883"></a>03883         (
 <a name="l03884"></a>03884             node_split_threshold IS NOT NULL   AND
 <a name="l03885"></a>03885             float8ge(node_split_threshold, 0)  AND
 <a name="l03886"></a>03886             float8le(node_split_threshold, 1),
 <a name="l03887"></a>03887             &#39;node_split_threshold value must be in range from 0 to 1&#39;
 <a name="l03888"></a>03888         );
 <a name="l03889"></a>03889
 <a name="l03890"></a>03890     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03891"></a>03891         (
 <a name="l03892"></a>03892             verbosity IS NOT NULL,
 <a name="l03893"></a>03893             &#39;verbosity must be non-null&#39;
 <a name="l03894"></a>03894         );
 <a name="l03895"></a>03895
 <a name="l03896"></a>03896     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03897"></a>03897         (
 <a name="l03898"></a>03898             id_col_name IS NOT NULL AND
 <a name="l03899"></a>03899             class_col_name IS NOT NULL AND
 <a name="l03900"></a>03900             length(btrim(id_col_name, &#39; &#39;)) &gt; 0 AND
 <a name="l03901"></a>03901             length(btrim(class_col_name, &#39; &#39;)) &gt; 0,
 <a name="l03902"></a>03902             &#39;invalid id column name or class column name&#39;
 <a name="l03903"></a>03903         );
 <a name="l03904"></a>03904
 <a name="l03905"></a>03905     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03906"></a>03906         (
 <a name="l03907"></a>03907             training_table_name IS NOT NULL AND
 <a name="l03908"></a>03908             MADLIB_SCHEMA.__table_exists
 <a name="l03909"></a>03909                 (
 <a name="l03910"></a>03910                     training_table_name
 <a name="l03911"></a>03911                 ),
 <a name="l03912"></a>03912             &#39;the specified training table&#39; ||
 <a name="l03913"></a>03913             coalesce(&#39;&lt;&#39;                   ||
 <a name="l03914"></a>03914             training_table_name            ||
 <a name="l03915"></a>03915             &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
 <a name="l03916"></a>03916         );
 <a name="l03917"></a>03917
 <a name="l03918"></a>03918     EXECUTE &#39;SELECT count(*) FROM
 <a name="l03919"></a>03919                 (SELECT * FROM &#39;||training_table_name||&#39; LIMIT 1) l&#39;
 <a name="l03920"></a>03920         INTO num_of_element;
 <a name="l03921"></a>03921
 <a name="l03922"></a>03922     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03923"></a>03923             (
 <a name="l03924"></a>03924                 num_of_element &gt; 0,
 <a name="l03925"></a>03925                 &#39;the specified training table &lt;&#39;||training_table_name||
 <a name="l03926"></a>03926                 &#39;&gt; should not be empty&#39;
 <a name="l03927"></a>03927             );
 <a name="l03928"></a>03928
 <a name="l03929"></a>03929
 <a name="l03930"></a>03930     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03931"></a>03931             (
 <a name="l03932"></a>03932                 result_table_name IS NOT NULL,
 <a name="l03933"></a>03933                 &#39;the specified result &#39; || error_msg ||  &#39; table name is NULL&#39;
 <a name="l03934"></a>03934             );
 <a name="l03935"></a>03935
 <a name="l03936"></a>03936     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03937"></a>03937             (
 <a name="l03938"></a>03938                 NOT MADLIB_SCHEMA.__table_exists
 <a name="l03939"></a>03939                     (
 <a name="l03940"></a>03940                         result_table_name
 <a name="l03941"></a>03941                     )
 <a name="l03942"></a>03942                 ,
 <a name="l03943"></a>03943                 &#39;the specified result &#39; || error_msg || &#39; table&lt;&#39; ||
 <a name="l03944"></a>03944                 result_table_name ||
 <a name="l03945"></a>03945                 &#39;&gt; exists&#39;
 <a name="l03946"></a>03946             );
 <a name="l03947"></a>03947 END
 <a name="l03948"></a>03948 $$ LANGUAGE PLPGSQL STABLE;
 <a name="l03949"></a>03949
 <a name="l03950"></a>03950
 <a name="l03951"></a>03951 <span class="comment">/*</span>
 <a name="l03952"></a>03952 <span class="comment"> * @brief Get the name of the encoded table and the name of</span>
 <a name="l03953"></a>03953 <span class="comment"> *        its meta table.</span>
 <a name="l03954"></a>03954 <span class="comment"> * @param result_table_name   The name of the table where the </span>
 <a name="l03955"></a>03955 <span class="comment"> *                            resulting DT will be kept </span>
 <a name="l03956"></a>03956 <span class="comment"> * @param error_msg           The reported error message when the</span>
 <a name="l03957"></a>03957 <span class="comment"> *                            length of result schema name plus</span>
 <a name="l03958"></a>03958 <span class="comment"> *                            the length of result table name is</span>
 <a name="l03959"></a>03959 <span class="comment"> *                            larger than 58.</span>
 <a name="l03960"></a>03960 <span class="comment"> * </span>
 <a name="l03961"></a>03961 <span class="comment"> * @return A text array that contains two elements. The first element</span>
 <a name="l03962"></a>03962 <span class="comment"> *        is the encoded table name and the second is the meta table name.</span>
 <a name="l03963"></a>03963 <span class="comment"> *                            </span>
 <a name="l03964"></a>03964 <span class="comment"> */</span>
 <a name="l03965"></a>03965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
 <a name="l03966"></a>03966     (
 <a name="l03967"></a>03967     result_table_name     TEXT,
 <a name="l03968"></a>03968     error_msg             TEXT
 <a name="l03969"></a>03969     )
 <a name="l03970"></a>03970 RETURNS TEXT[] AS $$
 <a name="l03971"></a>03971 DECLARE
 <a name="l03972"></a>03972     result_schema_name    TEXT;
 <a name="l03973"></a>03973     table_names           TEXT[];
 <a name="l03974"></a>03974 BEGIN
 <a name="l03975"></a>03975     result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
 <a name="l03976"></a>03976
 <a name="l03977"></a>03977     -- the maximum length of an identifier 63
 <a name="l03978"></a>03978     -- encoding table name convention:  &lt;schema name&gt;_&lt;table name&gt;_ed
 <a name="l03979"></a>03979     -- data info table name convention: &lt;schema name&gt;_&lt;table name&gt;_di
 <a name="l03980"></a>03980     -- the KV table name convention:    &lt;schema name&gt;_&lt;table name&gt;_&lt;####&gt;
 <a name="l03981"></a>03981     -- therefore, the maximum length of &#39;&lt;schema name&gt;_&lt;table name&gt;&#39; is 58
 <a name="l03982"></a>03982     PERFORM MADLIB_SCHEMA.__assert
 <a name="l03983"></a>03983         (
 <a name="l03984"></a>03984             length(
 <a name="l03985"></a>03985                 result_schema_name      ||
 <a name="l03986"></a>03986                 &#39;_&#39;                   ||
 <a name="l03987"></a>03987                 result_table_name) &lt;= 58,
 <a name="l03988"></a>03988             &#39;the maximum length of &#39;&#39;&#39; || error_msg || &#39;&#39;&#39; is 58&#39;
 <a name="l03989"></a>03989         );
 <a name="l03990"></a>03990
 <a name="l03991"></a>03991     -- the encoded table and meta table will be under the specified schema
 <a name="l03992"></a>03992     table_names[1]  = result_schema_name                      ||
 <a name="l03993"></a>03993                     &#39;.&#39;                                   ||
 <a name="l03994"></a>03994                     replace(result_table_name, &#39;.&#39;, &#39;_&#39;)    ||
 <a name="l03995"></a>03995                     &#39;_ed&#39;;
 <a name="l03996"></a>03996     table_names[2] = result_schema_name                      ||
 <a name="l03997"></a>03997                     &#39;.&#39;                                   ||
 <a name="l03998"></a>03998                     replace(result_table_name, &#39;.&#39;, &#39;_&#39;)    ||
 <a name="l03999"></a>03999                     &#39;_di&#39;;
 <a name="l04000"></a>04000     RETURN table_names;
 <a name="l04001"></a>04001 END
 <a name="l04002"></a>04002 $$ LANGUAGE PLPGSQL STABLE;
 <a name="l04003"></a>04003
 <a name="l04004"></a>04004
 <a name="l04005"></a>04005 <span class="comment">/*</span>
 <a name="l04006"></a>04006 <span class="comment"> * @brief Validate if the provided columns are in the training table or not.</span>
 <a name="l04007"></a>04007 <span class="comment"> *</span>
 <a name="l04008"></a>04008 <span class="comment"> * @param training_table_name       The name of the table/view with the source data.</span>
 <a name="l04009"></a>04009 <span class="comment"> * @param continuous_feature_names  A text array that contains all the continuous </span>
 <a name="l04010"></a>04010 <span class="comment"> *                                  features&#39; names. </span>
 <a name="l04011"></a>04011 <span class="comment"> * @param feature_col_names         A text array that contains all the features&#39; names.</span>
 <a name="l04012"></a>04012 <span class="comment"> * @param id_col_name               The name of the column containing an ID for each record.</span>
 <a name="l04013"></a>04013 <span class="comment"> * @param class_col_name            The name of the column containing the labeled class. </span>
 <a name="l04014"></a>04014 <span class="comment"> * @param features_per_node         The number of features to be considered when finding </span>
 <a name="l04015"></a>04015 <span class="comment"> *                                  a best split.</span>
 <a name="l04016"></a>04016 <span class="comment"> */</span>
 <a name="l04017"></a>04017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
 <a name="l04018"></a>04018     (
 <a name="l04019"></a>04019     training_table_name         TEXT,
 <a name="l04020"></a>04020     continuous_feature_names    TEXT[],
 <a name="l04021"></a>04021     feature_col_names           TEXT[],
 <a name="l04022"></a>04022     id_col_name                 TEXT,
 <a name="l04023"></a>04023     class_col_name              TEXT,
 <a name="l04024"></a>04024     features_per_node           INT
 <a name="l04025"></a>04025     )
 <a name="l04026"></a>04026 RETURNS VOID AS $$
 <a name="l04027"></a>04027 DECLARE
 <a name="l04028"></a>04028     num_attrs                 INT;
 <a name="l04029"></a>04029 BEGIN
 <a name="l04030"></a>04030     PERFORM MADLIB_SCHEMA.__assert
 <a name="l04031"></a>04031         (
 <a name="l04032"></a>04032             MADLIB_SCHEMA.__column_exists
 <a name="l04033"></a>04033                 (
 <a name="l04034"></a>04034                     training_table_name,
 <a name="l04035"></a>04035                     lower(btrim(id_col_name, &#39; &#39;))
 <a name="l04036"></a>04036                 ),
 <a name="l04037"></a>04037             &#39;the specified training table&lt;&#39; ||
 <a name="l04038"></a>04038             training_table_name             ||
 <a name="l04039"></a>04039             &#39;&gt; does not have column &#39;&#39;&#39;     ||
 <a name="l04040"></a>04040             id_col_name                     ||
 <a name="l04041"></a>04041             &#39;&#39;&#39;&#39;
 <a name="l04042"></a>04042         );
 <a name="l04043"></a>04043
 <a name="l04044"></a>04044     PERFORM MADLIB_SCHEMA.__assert
 <a name="l04045"></a>04045         (
 <a name="l04046"></a>04046             MADLIB_SCHEMA.__column_exists
 <a name="l04047"></a>04047                 (
 <a name="l04048"></a>04048                     training_table_name,
 <a name="l04049"></a>04049                     lower(btrim(class_col_name, &#39; &#39;))
 <a name="l04050"></a>04050                 ),
 <a name="l04051"></a>04051             &#39;the specified training table&lt;&#39;     ||
 <a name="l04052"></a>04052             training_table_name                 ||
 <a name="l04053"></a>04053             &#39;&gt; does not have column &#39;&#39;&#39;         ||
 <a name="l04054"></a>04054             class_col_name                      ||
 <a name="l04055"></a>04055             &#39;&#39;&#39;&#39;
 <a name="l04056"></a>04056         );
 <a name="l04057"></a>04057
 <a name="l04058"></a>04058     IF (feature_col_names IS NULL) THEN
 <a name="l04059"></a>04059         -- 2 means the id and class column
 <a name="l04060"></a>04060         num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
 <a name="l04061"></a>04061
 <a name="l04062"></a>04062         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04063"></a>04063             (
 <a name="l04064"></a>04064                 (features_per_node IS NULL AND num_attrs &gt; 0)  OR
 <a name="l04065"></a>04065                 (features_per_node IS NOT NULL AND num_attrs &gt;= features_per_node),
 <a name="l04066"></a>04066                 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
 <a name="l04067"></a>04067                 &#39;of features of the training table&#39;
 <a name="l04068"></a>04068            );
 <a name="l04069"></a>04069         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04070"></a>04070             (
 <a name="l04071"></a>04071                 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
 <a name="l04072"></a>04072                 &#39;each feature in continuous_feature_names must be a column of the training table&#39;
 <a name="l04073"></a>04073             );
 <a name="l04074"></a>04074     ELSE
 <a name="l04075"></a>04075         num_attrs = array_upper(feature_col_names, 1);
 <a name="l04076"></a>04076         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04077"></a>04077             (
 <a name="l04078"></a>04078                 (features_per_node IS NULL AND num_attrs &gt; 0) OR
 <a name="l04079"></a>04079                 (features_per_node IS NOT NULL AND num_attrs &gt;= features_per_node),
 <a name="l04080"></a>04080                 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
 <a name="l04081"></a>04081                 &#39;of features of the training table&#39;
 <a name="l04082"></a>04082            );
 <a name="l04083"></a>04083         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04084"></a>04084             (
 <a name="l04085"></a>04085                 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
 <a name="l04086"></a>04086                 &#39;each feature in feature_col_names must be a column of the training table&#39;
 <a name="l04087"></a>04087             );
 <a name="l04088"></a>04088
 <a name="l04089"></a>04089         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04090"></a>04090             (
 <a name="l04091"></a>04091                 coalesce(continuous_feature_names, &#39;{}&#39;::TEXT[]) &lt;@ feature_col_names,
 <a name="l04092"></a>04092                 &#39;each feature in continuous_feature_names must be in the feature_col_names&#39;
 <a name="l04093"></a>04093             );
 <a name="l04094"></a>04094     END IF;
 <a name="l04095"></a>04095 END
 <a name="l04096"></a>04096 $$ LANGUAGE PLPGSQL STABLE;
 <a name="l04097"></a>04097
 <a name="l04098"></a>04098
 <a name="l04099"></a>04099 <span class="comment">/* @brief  If the training table is a valid encoded table, then we use it directly.</span>
 <a name="l04100"></a>04100 <span class="comment"> *         If the training table is not encoded, then we invoke the encoding procedure</span>
 <a name="l04101"></a>04101 <span class="comment"> *         to transform the training table. </span>
 <a name="l04102"></a>04102 <span class="comment"> *         With the encoded table, we call the tree grow engine to generate the final tree.</span>
 <a name="l04103"></a>04103 <span class="comment"> *</span>
 <a name="l04104"></a>04104 <span class="comment"> * @param dt_algo_name                The name of the algorithm. Currently, it&#39;s</span>
 <a name="l04105"></a>04105 <span class="comment"> *                                    &#39;C4.5&#39; or &#39;RF&#39;</span>
 <a name="l04106"></a>04106 <span class="comment"> * @param split_criterion             This parameter specifies which split criterion </span>
 <a name="l04107"></a>04107 <span class="comment"> *                                    should be used for tree construction and </span>
 <a name="l04108"></a>04108 <span class="comment"> *                                    pruning. The valid values are infogain, </span>
 <a name="l04109"></a>04109 <span class="comment"> *                                    gainratio, and gini.</span>
 <a name="l04110"></a>04110 <span class="comment"> * @param num_trees                   Total number of trees to be trained. </span>
 <a name="l04111"></a>04111 <span class="comment"> * @param features_per_node           Total number of features used to compute split </span>
 <a name="l04112"></a>04112 <span class="comment"> *                                    gain for each node. </span>
 <a name="l04113"></a>04113 <span class="comment"> * @param training_table_name         The name of the table/view with the source data. </span>
 <a name="l04114"></a>04114 <span class="comment"> * @param validation_table_name       The name of the validation table. </span>
 <a name="l04115"></a>04115 <span class="comment"> * @param tree_table_name             The name of the table where the resulting </span>
 <a name="l04116"></a>04116 <span class="comment"> *                                    DT/RF will be stored. </span>
 <a name="l04117"></a>04117 <span class="comment"> * @param continuous_feature_names    A comma-separated list of the names of features whose values </span>
 <a name="l04118"></a>04118 <span class="comment"> *                                    are continuous. The default is null, which means there are </span>
 <a name="l04119"></a>04119 <span class="comment"> *                                    no continuous features in the training table.</span>
 <a name="l04120"></a>04120 <span class="comment"> * @param feature_col_names           A comma-separated list of the names of table columns, each of</span>
 <a name="l04121"></a>04121 <span class="comment"> *                                    which defines a feature. The default value is null, which means </span>
 <a name="l04122"></a>04122 <span class="comment"> *                                    all the columns in the training table, except columns named </span>
 <a name="l04123"></a>04123 <span class="comment"> *                                    &#39;id&#39; and &#39;class&#39;, will be used as features.</span>
 <a name="l04124"></a>04124 <span class="comment"> * @param id_col_name                 The name of the column containing id of each point.  </span>
 <a name="l04125"></a>04125 <span class="comment"> * @param class_col_name              The name of the column containing correct class </span>
 <a name="l04126"></a>04126 <span class="comment"> *                                    of each point.  </span>
 <a name="l04127"></a>04127 <span class="comment"> * @param confidence_level            A statistical confidence interval of the </span>
 <a name="l04128"></a>04128 <span class="comment"> *                                    resubstitution error.  </span>
 <a name="l04129"></a>04129 <span class="comment"> * @param how2handle_missing_value    How missing values are handled. The valid values </span>
 <a name="l04130"></a>04130 <span class="comment"> *                                    are &#39;explicit&#39; and &#39;ignore&#39;.</span>
 <a name="l04131"></a>04131 <span class="comment"> * @param max_tree_depth              Maximum decision tree depth.  </span>
 <a name="l04132"></a>04132 <span class="comment"> * @param sampling_percentage         The percentage of records sampled to train a tree.</span>
 <a name="l04133"></a>04133 <span class="comment"> *                                    If it&#39;s NULL, 0.632 bootstrap will be used</span>
 <a name="l04134"></a>04134 <span class="comment"> * @param sampling_needed             Whether to enable the sampling functionality.  </span>
 <a name="l04135"></a>04135 <span class="comment"> * @param node_prune_threshold        Specifies the minimum number of samples required </span>
 <a name="l04136"></a>04136 <span class="comment"> *                                    in a child node.  </span>
 <a name="l04137"></a>04137 <span class="comment"> * @param node_split_threshold        Specifies the minimum number of samples required </span>
 <a name="l04138"></a>04138 <span class="comment"> *                                    in a node in order for a further split   </span>
 <a name="l04139"></a>04139 <span class="comment"> *                                    to be possible.  </span>
 <a name="l04140"></a>04140 <span class="comment"> * @param error_msg                   The reported error message when the result table</span>
 <a name="l04141"></a>04141 <span class="comment"> *                                    name is invalid.</span>
 <a name="l04142"></a>04142 <span class="comment"> * @param verbosity                   &gt; 0 means this function runs in verbose mode. </span>
 <a name="l04143"></a>04143 <span class="comment"> *</span>
 <a name="l04144"></a>04144 <span class="comment"> * @return An instance of __train_result.</span>
 <a name="l04145"></a>04145 <span class="comment"> *</span>
 <a name="l04146"></a>04146 <span class="comment"> */</span>
 <a name="l04147"></a>04147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
 <a name="l04148"></a>04148     (
 <a name="l04149"></a>04149     dt_algo_name                TEXT,
 <a name="l04150"></a>04150     split_criterion             TEXT,
 <a name="l04151"></a>04151     num_trees                   INT,
 <a name="l04152"></a>04152     features_per_node           INT,
 <a name="l04153"></a>04153     training_table_name         TEXT,
 <a name="l04154"></a>04154     validation_table_name       TEXT,
 <a name="l04155"></a>04155     tree_table_name             TEXT,
 <a name="l04156"></a>04156     continuous_feature_names    TEXT,
 <a name="l04157"></a>04157     feature_col_names           TEXT,
 <a name="l04158"></a>04158     id_col_name                 TEXT,
 <a name="l04159"></a>04159     class_col_name              TEXT,
 <a name="l04160"></a>04160     confidence_level            FLOAT8,
 <a name="l04161"></a>04161     how2handle_missing_value    TEXT,
 <a name="l04162"></a>04162     max_tree_depth              INT,
 <a name="l04163"></a>04163     sampling_percentage         FLOAT8,
 <a name="l04164"></a>04164     sampling_needed             BOOL,
 <a name="l04165"></a>04165     node_prune_threshold        FLOAT8,
 <a name="l04166"></a>04166     node_split_threshold        FLOAT8,
 <a name="l04167"></a>04167     error_msg                   TEXT,
 <a name="l04168"></a>04168     verbosity                   INT
 <a name="l04169"></a>04169     )
 <a name="l04170"></a>04170 RETURNS RECORD AS $$
 <a name="l04171"></a>04171 DECLARE
 <a name="l04172"></a>04172     table_names             TEXT[]; -- [1]: encoded table name; [2]: metatable name
 <a name="l04173"></a>04173     h2hmv_routine_id        INT := 1; -- 1: &#39;ignore&#39;; 2: any other routine name
 <a name="l04174"></a>04174     h2hmv_routine_name      TEXT;
 <a name="l04175"></a>04175     n_fids                  INT;
 <a name="l04176"></a>04176     curstmt                 TEXT;
 <a name="l04177"></a>04177     enc_tree_name           TEXT;
 <a name="l04178"></a>04178     cont_feature_col_names  TEXT[];
 <a name="l04179"></a>04179     feature_name_array      TEXT[];
 <a name="l04180"></a>04180     train_rs                MADLIB_SCHEMA.__train_result;
 <a name="l04181"></a>04181 BEGIN
 <a name="l04182"></a>04182     cont_feature_col_names  = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
 <a name="l04183"></a>04183     feature_name_array      = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
 <a name="l04184"></a>04184
 <a name="l04185"></a>04185     -- if the training table is a valid encoded table, then we retrieve
 <a name="l04186"></a>04186     -- the relevant information from the training_info table directly.
 <a name="l04187"></a>04187     IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
 <a name="l04188"></a>04188         enc_tree_name       = MADLIB_SCHEMA.__get_tree_table_name
 <a name="l04189"></a>04189                                     (training_table_name);
 <a name="l04190"></a>04190         table_names[1]      = training_table_name;
 <a name="l04191"></a>04191         table_names[2]      = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
 <a name="l04192"></a>04192         h2hmv_routine_name  = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
 <a name="l04193"></a>04193         IF (h2hmv_routine_name = &#39;ignore&#39;) THEN
 <a name="l04194"></a>04194             h2hmv_routine_id = 1;
 <a name="l04195"></a>04195         ELSE
 <a name="l04196"></a>04196             h2hmv_routine_id = 2;
 <a name="l04197"></a>04197         END IF;
 <a name="l04198"></a>04198
 <a name="l04199"></a>04199         -- validate the metatable
 <a name="l04200"></a>04200         PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
 <a name="l04201"></a>04201
 <a name="l04202"></a>04202         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
 <a name="l04203"></a>04203         PERFORM MADLIB_SCHEMA.__assert
 <a name="l04204"></a>04204             (
 <a name="l04205"></a>04205                 features_per_node IS NULL OR
 <a name="l04206"></a>04206                 n_fids &gt;= features_per_node,
 <a name="l04207"></a>04207                 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
 <a name="l04208"></a>04208                 &#39;of features of the training table&#39;
 <a name="l04209"></a>04209            );
 <a name="l04210"></a>04210         -- create tree table and auxiliary tables
 <a name="l04211"></a>04211         -- so that we can get the schema name of the table
 <a name="l04212"></a>04212         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
 <a name="l04213"></a>04213     ELSE
 <a name="l04214"></a>04214         -- the provided columns must be in the training table
 <a name="l04215"></a>04215         PERFORM MADLIB_SCHEMA.__check_training_table
 <a name="l04216"></a>04216             (
 <a name="l04217"></a>04217                 training_table_name,
 <a name="l04218"></a>04218                 cont_feature_col_names,
 <a name="l04219"></a>04219                 feature_name_array,
 <a name="l04220"></a>04220                 id_col_name,
 <a name="l04221"></a>04221                 class_col_name,
 <a name="l04222"></a>04222                 features_per_node
 <a name="l04223"></a>04223             );
 <a name="l04224"></a>04224
 <a name="l04225"></a>04225         h2hmv_routine_name = btrim(how2handle_missing_value, &#39; &#39;);
 <a name="l04226"></a>04226         IF (h2hmv_routine_name = &#39;ignore&#39;) THEN
 <a name="l04227"></a>04227             h2hmv_routine_id = 1;
 <a name="l04228"></a>04228         ELSE
 <a name="l04229"></a>04229             h2hmv_routine_id = 2;
 <a name="l04230"></a>04230         END IF;
 <a name="l04231"></a>04231
 <a name="l04232"></a>04232         -- create tree table and auxiliary tables
 <a name="l04233"></a>04233         -- so that we can get the schema name of the table
 <a name="l04234"></a>04234         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
 <a name="l04235"></a>04235
 <a name="l04236"></a>04236         -- encode the training table
 <a name="l04237"></a>04237         table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
 <a name="l04238"></a>04238         PERFORM MADLIB_SCHEMA.__encode_table
 <a name="l04239"></a>04239             (
 <a name="l04240"></a>04240                 training_table_name,
 <a name="l04241"></a>04241                 lower(id_col_name),
 <a name="l04242"></a>04242                 feature_name_array,
 <a name="l04243"></a>04243                 lower(class_col_name),
 <a name="l04244"></a>04244                 cont_feature_col_names,
 <a name="l04245"></a>04245                 table_names[1],
 <a name="l04246"></a>04246                 table_names[2],
 <a name="l04247"></a>04247                 h2hmv_routine_id,
 <a name="l04248"></a>04248                 verbosity
 <a name="l04249"></a>04249             );
 <a name="l04250"></a>04250         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
 <a name="l04251"></a>04251     END IF;
 <a name="l04252"></a>04252
 <a name="l04253"></a>04253     IF (sampling_needed) THEN
 <a name="l04254"></a>04254         IF (features_per_node IS NULL) THEN
 <a name="l04255"></a>04255             n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
 <a name="l04256"></a>04256         ELSE
 <a name="l04257"></a>04257             n_fids = features_per_node;
 <a name="l04258"></a>04258         END IF;
 <a name="l04259"></a>04259     END IF;
 <a name="l04260"></a>04260
 <a name="l04261"></a>04261     IF (verbosity &gt; 0) THEN
 <a name="l04262"></a>04262         RAISE INFO &#39;features_per_node: %&#39;, n_fids;
 <a name="l04263"></a>04263     END IF;
 <a name="l04264"></a>04264
 <a name="l04265"></a>04265     -- insert data into the training_info table
 <a name="l04266"></a>04266     PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
 <a name="l04267"></a>04267         (
 <a name="l04268"></a>04268             dt_algo_name,
 <a name="l04269"></a>04269             tree_table_name,
 <a name="l04270"></a>04270             training_table_name,
 <a name="l04271"></a>04271             table_names[2],
 <a name="l04272"></a>04272             table_names[1],
 <a name="l04273"></a>04273             validation_table_name,
 <a name="l04274"></a>04274             h2hmv_routine_name,
 <a name="l04275"></a>04275             split_criterion,
 <a name="l04276"></a>04276             sampling_percentage,
 <a name="l04277"></a>04277             n_fids,
 <a name="l04278"></a>04278             num_trees
 <a name="l04279"></a>04279         );
 <a name="l04280"></a>04280
 <a name="l04281"></a>04281     -- call the tree grow engine
 <a name="l04282"></a>04282     train_rs = MADLIB_SCHEMA.__train_tree
 <a name="l04283"></a>04283         (
 <a name="l04284"></a>04284             split_criterion,
 <a name="l04285"></a>04285             num_trees,
 <a name="l04286"></a>04286             n_fids ,
 <a name="l04287"></a>04287             table_names[1],
 <a name="l04288"></a>04288             table_names[2],
 <a name="l04289"></a>04289             tree_table_name,
 <a name="l04290"></a>04290             validation_table_name,
 <a name="l04291"></a>04291             &#39;id&#39;, -- literal, not id_col_name; presumably the encoded table&#39;s id column (TODO confirm)
 <a name="l04292"></a>04292             &#39;class&#39;, -- literal, not class_col_name; presumably the encoded table&#39;s class column (TODO confirm)
 <a name="l04293"></a>04293             confidence_level,
 <a name="l04294"></a>04294             max_tree_depth,
 <a name="l04295"></a>04295             sampling_percentage,
 <a name="l04296"></a>04296             node_prune_threshold,
 <a name="l04297"></a>04297             node_split_threshold,
 <a name="l04298"></a>04298             sampling_needed,
 <a name="l04299"></a>04299             h2hmv_routine_id,
 <a name="l04300"></a>04300             verbosity
 <a name="l04301"></a>04301         );
 <a name="l04302"></a>04302
 <a name="l04303"></a>04303     RETURN train_rs;
 <a name="l04304"></a>04304 END
 <a name="l04305"></a>04305 $$ LANGUAGE PLPGSQL STABLE; -- NOTE(review): declared STABLE, yet calls __create_tree_tables, __encode_table and __insert_into_traininginfo; confirm the intended volatility
 </pre></div></div>
 </div>
   <div id="nav-path" class="navpath">
     <ul>
       <li class="navelem"><a class="el" href="dt_8sql__in.html">dt.sql_in</a>      </li>
 <!-- window showing the filter options -->
 <div id="MSearchSelectWindow"
      onmouseover="return searchBox.OnSearchSelectShow()"
      onmouseout="return searchBox.OnSearchSelectHide()"
      onkeydown="return searchBox.OnSearchSelectKey(event)">
 <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>

 <!-- iframe showing the search results (closed by default) -->
 <div id="MSearchResultsWindow">
 <iframe src="javascript:void(0)" frameborder="0"
         name="MSearchResults" id="MSearchResults">
 </iframe>
 </div>


     <li class="footer">Generated on Fri May 10 2013 01:37:13 for MADlib by
     <a href="http://www.doxygen.org/index.html">
     <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
    </ul>
  </div>


 </body>
 </html>