<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!-- Media type and encoding hint for user agents. XHTML 1.0 delivered as HTML
     is labelled text/html; "text/xhtml" is not a registered media type. -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>MADlib: dt.sql_in Source File</title>
<!-- Style sheets for the tab bar, the general page layout and the navigation tree. -->
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<!-- jQuery is loaded first because the inline scripts below use $(document).ready. -->
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<!-- searchBox is created by an inline script in the page body; this callback
     only runs once the document is ready, i.e. after that script has executed. -->
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<!-- The configuration is placed inside the tag that loads MathJax.js
     (MathJax's in-line configuration style). The object literal must not end
     with a trailing comma, which older IE script engines reject. -->
<script type="text/javascript" src="../mathjax/MathJax.js">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"]
});
</script>
</head>
<body>
<div id="top"><!-- do not remove this div! -->
  <!-- Title area: project name, version number and a link to the newer documentation. -->
  <div id="titlearea">
    <table cellspacing="0" cellpadding="0">
      <tbody>
        <tr style="height: 56px;">
          <td style="padding-left: 0.5em;">
            <div id="projectname">MADlib
              &#160;<span id="projectnumber">0.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./dt_8sql__in_source.html"> A newer version is available</a></span>
            </div>
            <div id="projectbrief">User Documentation</div>
          </td>
        </tr>
      </tbody>
    </table>
  </div>
  <!-- Generated by Doxygen 1.7.5.1 -->
  <!-- Creates the global searchBox object used by the event handlers below. -->
  <script type="text/javascript">
    var searchBox = new SearchBox("searchBox", "search",false,'Search');
  </script>
  <script type="text/javascript" src="dynsections.js"></script>
  <!-- First tab row: top-level sections plus the search field. -->
  <div id="navrow1" class="tabs">
    <ul class="tablist">
      <li><a href="index.html"><span>Main&#160;Page</span></a></li>
      <li><a href="modules.html"><span>Modules</span></a></li>
      <li class="current"><a href="files.html"><span>Files</span></a></li>
      <li>
        <div id="MSearchBox" class="MSearchBoxInactive">
          <span class="left">
            <img id="MSearchSelect" src="search/mag_sel.png" onmouseover="return searchBox.OnSearchSelectShow()" onmouseout="return searchBox.OnSearchSelectHide()" alt=""/>
            <input type="text" id="MSearchField" value="Search" accesskey="S" onfocus="searchBox.OnSearchFieldFocus(true)" onblur="searchBox.OnSearchFieldFocus(false)" onkeyup="searchBox.OnSearchFieldChange(event)"/>
          </span><span class="right">
            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
          </span>
        </div>
      </li>
    </ul>
  </div>
  <!-- Second tab row: sub-pages of the "Files" section. -->
  <div id="navrow2" class="tabs2">
    <ul class="tablist">
      <li><a href="files.html"><span>File&#160;List</span></a></li>
      <li><a href="globals.html"><span>File&#160;Members</span></a></li>
    </ul>
  </div>
</div>
<!-- Side panel: an empty container for the navigation tree and a split bar handle. -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
  <div id="nav-tree">
    <div id="nav-tree-contents">
    </div>
  </div>
  <div id="splitbar" style="-moz-user-select:none;" class="ui-resizable-handle">
  </div>
</div>
<!-- Initialises the navigation tree, passing this file's documentation page. -->
<script type="text/javascript">
  initNavTree('dt_8sql__in.html','');
</script>
<div id="doc-content">
<div class="header">
<div class="headertitle">
<div class="title">dt.sql_in</div> </div>
</div>
<div class="contents">
<a href="dt_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/**</span>
<a name="l00002"></a>00002 <span class="comment"> *</span>
<a name="l00003"></a>00003 <span class="comment"> * @file dt.sql_in</span>
<a name="l00004"></a>00004 <span class="comment"> *</span>
<a name="l00005"></a>00005 <span class="comment"> * @brief the common functions written in PL/PGSQL shared by C4.5 and RF</span>
<a name="l00006"></a>00006 <span class="comment"> * @date April 5, 2012</span>
<a name="l00007"></a>00007 <span class="comment"> *</span>
<a name="l00008"></a>00008 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
<a name="l00009"></a>00009
<a name="l00010"></a>00010 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
<a name="l00011"></a>00011 <span class="stringliteral"></span>
<a name="l00012"></a>00012 <span class="stringliteral">/* Own macro definitions */</span>
<a name="l00013"></a>00013 <span class="stringliteral">m4_ifelse(</span>
<a name="l00014"></a>00014 <span class="stringliteral"> m4_eval(</span>
<a name="l00015"></a>00015 <span class="stringliteral"> m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
<a name="l00016"></a>00016 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ &lt; 401
<a name="l00017"></a>00017 ), 1,
<a name="l00018"></a>00018 `m4_define(`__GREENPLUM_PRE_4_1__<span class="charliteral">&#39;)&#39;</span>
<a name="l00019"></a>00019 )
<a name="l00020"></a>00020 m4_ifelse(
<a name="l00021"></a>00021 m4_eval(
<a name="l00022"></a>00022 m4_ifdef(`__POSTGRESQL__<span class="stringliteral">&#39;, 1, 0) &amp;&amp;</span>
<a name="l00023"></a>00023 <span class="stringliteral"> __DBMS_VERSION_MAJOR__ &lt; 9</span>
<a name="l00024"></a>00024 <span class="stringliteral"> ), 1,</span>
<a name="l00025"></a>00025 <span class="stringliteral"> `m4_define(`__POSTGRESQL_PRE_9_0__&#39;</span>)<span class="stringliteral">&#39;</span>
<a name="l00026"></a>00026 <span class="stringliteral">)</span>
<a name="l00027"></a>00027 <span class="stringliteral"></span>
<a name="l00028"></a>00028 <span class="stringliteral">m4_ifelse(</span>
<a name="l00029"></a>00029 <span class="stringliteral"> m4_eval(</span>
<a name="l00030"></a>00030 <span class="stringliteral"> m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
<a name="l00031"></a>00031 __DBMS_VERSION_MAJOR__ * 10000 +
<a name="l00032"></a>00032 __DBMS_VERSION_MINOR__ * 100 +
<a name="l00033"></a>00033 __DBMS_VERSION_PATCH__ &gt;= 40201
<a name="l00034"></a>00034 ), 1,
<a name="l00035"></a>00035 `m4_define(`__GREENPLUM_GE_4_2_1__<span class="charliteral">&#39;)&#39;</span>
<a name="l00036"></a>00036 )
<a name="l00037"></a>00037
<a name="l00038"></a>00038 <span class="comment">/*</span>
<a name="l00039"></a>00039 <span class="comment"> * This is a global table to store information for various tree training.</span>
<a name="l00040"></a>00040 <span class="comment"> *</span>
<a name="l00041"></a>00041 <span class="comment"> * classifier_name The name of the classifier, e.g., &#39;C4.5&#39; or &#39;RF&#39;.</span>
<a name="l00042"></a>00042 <span class="comment"> * result_table_oid The OID of the result table.</span>
<a name="l00043"></a>00043 <span class="comment"> * training_table_oid The OID of the training table.</span>
<a name="l00044"></a>00044 <span class="comment"> * training_metatable_oid The OID of the metadata table.</span>
<a name="l00045"></a>00045 <span class="comment"> * training_encoded_table_oid The OID of the encoded table.</span>
<a name="l00046"></a>00046 <span class="comment"> * validation_table_oid The OID of the validation table.</span>
<a name="l00047"></a>00047 <span class="comment"> * how2handle_missing_value The approach name to handle missing value.</span>
<a name="l00048"></a>00048 <span class="comment"> * split_criterion The name of the split criterion for this training.</span>
<a name="l00049"></a>00049 <span class="comment"> * sampling_percentage The sampling percentage for training each tree.</span>
<a name="l00050"></a>00050 <span class="comment"> * num_feature_chosen The number of features that will be chosen to find the best split.</span>
<a name="l00051"></a>00051 <span class="comment"> * num_trees The number of trees that will be grown in training.</span>
<a name="l00052"></a>00052 <span class="comment"> *</span>
<a name="l00053"></a>00053 <span class="comment"> */</span>
<a name="l00054"></a>00054 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
<a name="l00055"></a>00055 CREATE TABLE MADLIB_SCHEMA.training_info
<a name="l00056"></a>00056 (
<a name="l00057"></a>00057 classifier_name TEXT NOT NULL,
<a name="l00058"></a>00058 result_table_oid OID NOT NULL,
<a name="l00059"></a>00059 training_table_oid OID,
<a name="l00060"></a>00060 training_metatable_oid OID,
<a name="l00061"></a>00061 training_encoded_table_oid OID,
<a name="l00062"></a>00062 validation_table_oid OID,
<a name="l00063"></a>00063 how2handle_missing_value TEXT,
<a name="l00064"></a>00064 split_criterion TEXT,
<a name="l00065"></a>00065 sampling_percentage FLOAT,
<a name="l00066"></a>00066 num_feature_chosen INT,
<a name="l00067"></a>00067 num_trees INT,
<a name="l00068"></a>00068 PRIMARY KEY (result_table_oid)
<a name="l00069"></a>00069 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (result_table_oid)&#39;);
<a name="l00070"></a>00070 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
<a name="l00071"></a>00071
<a name="l00072"></a>00072
<a name="l00073"></a>00073 <span class="comment">/*</span>
<a name="l00074"></a>00074 <span class="comment"> * @brief Remove the trained tree from training info table. </span>
<a name="l00075"></a>00075 <span class="comment"> *</span>
<a name="l00076"></a>00076 <span class="comment"> * @param tree_table The full name of the tree table.</span>
<a name="l00077"></a>00077 <span class="comment"> *</span>
<a name="l00078"></a>00078 <span class="comment"> */</span>
<a name="l00079"></a>00079 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
<a name="l00080"></a>00080 (
<a name="l00081"></a>00081 tree_table TEXT
<a name="l00082"></a>00082 )
<a name="l00083"></a>00083 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00084"></a>00084 BEGIN
<a name="l00085"></a>00085 DELETE FROM MADLIB_SCHEMA.training_info
<a name="l00086"></a>00086 WHERE result_table_oid = tree_table::regclass;
<a name="l00087"></a>00087 end
<a name="l00088"></a>00088 $$ LANGUAGE PLPGSQL;
<a name="l00089"></a>00089
<a name="l00090"></a>00090
<a name="l00091"></a>00091 <span class="comment">/*</span>
<a name="l00092"></a>00092 <span class="comment"> * @brief Insert the trained tree into training info table. </span>
<a name="l00093"></a>00093 <span class="comment"> *</span>
<a name="l00094"></a>00094 <span class="comment"> * @param classifier_table_name The name of the classifier.</span>
<a name="l00095"></a>00095 <span class="comment"> * @param result_table_name The full name of the training result table.</span>
<a name="l00096"></a>00096 <span class="comment"> * @param training_table_name The full name of the training table.</span>
<a name="l00097"></a>00097 <span class="comment"> * @param training_metatable_name The full name of metatable.</span>
<a name="l00098"></a>00098 <span class="comment"> * @param training_encoded_table_name The full name of the encoded table. </span>
<a name="l00099"></a>00099 <span class="comment"> * @param validation_table_name The full name of the validation table.</span>
<a name="l00100"></a>00100 <span class="comment"> * @param how2handle_missing_value The name of the routine to process unknown </span>
<a name="l00101"></a>00101 <span class="comment"> * values.</span>
<a name="l00102"></a>00102 <span class="comment"> * @param split_criterion The name of split criterion.</span>
<a name="l00103"></a>00103 <span class="comment"> * @param sampling_percentage The percentage of bootstrap samples size in </span>
<a name="l00104"></a>00104 <span class="comment"> * training dataset.</span>
<a name="l00105"></a>00105 <span class="comment"> * @param num_features_chosen The number of features to split on each tree</span>
<a name="l00106"></a>00106 <span class="comment"> * node. </span>
<a name="l00107"></a>00107 <span class="comment"> * @param num_trees The number of trees after completing the </span>
<a name="l00108"></a>00108 <span class="comment"> * training process.</span>
<a name="l00109"></a>00109 <span class="comment"> * </span>
<a name="l00110"></a>00110 <span class="comment"> */</span>
<a name="l00111"></a>00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
<a name="l00112"></a>00112 (
<a name="l00113"></a>00113 classifier_table_name TEXT,
<a name="l00114"></a>00114 result_table_name TEXT,
<a name="l00115"></a>00115 training_table_name TEXT,
<a name="l00116"></a>00116 training_metatable_name TEXT,
<a name="l00117"></a>00117 training_encoded_table_name TEXT,
<a name="l00118"></a>00118 validation_table_name TEXT,
<a name="l00119"></a>00119 how2handle_missing_value TEXT,
<a name="l00120"></a>00120 split_criterion TEXT,
<a name="l00121"></a>00121 sampling_percentage FLOAT,
<a name="l00122"></a>00122 num_features_chosen INT,
<a name="l00123"></a>00123 num_trees INT
<a name="l00124"></a>00124 )
<a name="l00125"></a>00125 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00126"></a>00126 BEGIN
<a name="l00127"></a>00127 INSERT INTO MADLIB_SCHEMA.training_info VALUES
<a name="l00128"></a>00128 (
<a name="l00129"></a>00129 classifier_table_name,
<a name="l00130"></a>00130 result_table_name::regclass,
<a name="l00131"></a>00131 training_table_name::regclass,
<a name="l00132"></a>00132 training_metatable_name::regclass,
<a name="l00133"></a>00133 training_encoded_table_name::regclass,
<a name="l00134"></a>00134 validation_table_name::regclass,
<a name="l00135"></a>00135 how2handle_missing_value,
<a name="l00136"></a>00136 split_criterion,
<a name="l00137"></a>00137 sampling_percentage,
<a name="l00138"></a>00138 num_features_chosen,
<a name="l00139"></a>00139 num_trees
<a name="l00140"></a>00140 );
<a name="l00141"></a>00141 END
<a name="l00142"></a>00142 $$ LANGUAGE PLPGSQL;
<a name="l00143"></a>00143
<a name="l00144"></a>00144
<a name="l00145"></a>00145 <span class="comment">/*</span>
<a name="l00146"></a>00146 <span class="comment"> * @brief Get the name of the encoded table. </span>
<a name="l00147"></a>00147 <span class="comment"> *</span>
<a name="l00148"></a>00148 <span class="comment"> * @param tree_table The full name of the tree table.</span>
<a name="l00149"></a>00149 <span class="comment"> *</span>
<a name="l00150"></a>00150 <span class="comment"> * @return The full name of the encoded table.</span>
<a name="l00151"></a>00151 <span class="comment"> *</span>
<a name="l00152"></a>00152 <span class="comment"> */</span>
<a name="l00153"></a>00153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
<a name="l00154"></a>00154 (
<a name="l00155"></a>00155 tree_table TEXT
<a name="l00156"></a>00156 )
<a name="l00157"></a>00157 RETURNS TEXT AS $$
<a name="l00158"></a>00158 DECLARE
<a name="l00159"></a>00159 encoded_table_name TEXT := &#39;&#39;;
<a name="l00160"></a>00160 BEGIN
<a name="l00161"></a>00161 SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
<a name="l00162"></a>00162 FROM MADLIB_SCHEMA.training_info
<a name="l00163"></a>00163 WHERE result_table_oid = tree_table::regclass
<a name="l00164"></a>00164 INTO encoded_table_name;
<a name="l00165"></a>00165
<a name="l00166"></a>00166 RETURN encoded_table_name;
<a name="l00167"></a>00167 END
<a name="l00168"></a>00168 $$ LANGUAGE PLPGSQL STABLE;
<a name="l00169"></a>00169
<a name="l00170"></a>00170
<a name="l00171"></a>00171 <span class="comment">/*</span>
<a name="l00172"></a>00172 <span class="comment"> * @brief Test if the given table is a valid encoded one. </span>
<a name="l00173"></a>00173 <span class="comment"> * A valid encoded table has the following characteristic:</span>
<a name="l00174"></a>00174 <span class="comment"> * + Its OID is in the column &quot;training_encoded_table_oid&quot;</span>
<a name="l00175"></a>00175 <span class="comment"> * of training_info table.</span>
<a name="l00176"></a>00176 <span class="comment"> * + It has 5 columns, whose names are id, fid, fval,</span>
<a name="l00177"></a>00177 <span class="comment"> * is_cont and class.</span>
<a name="l00178"></a>00178 <span class="comment"> * + The types of the 5 columns are BIGINT, INT, FLOAT8,</span>
<a name="l00179"></a>00179 <span class="comment"> * BOOL and INT.</span>
<a name="l00180"></a>00180 <span class="comment"> *</span>
<a name="l00181"></a>00181 <span class="comment"> * @param enc_tbl_name The full name of the encoded table.</span>
<a name="l00182"></a>00182 <span class="comment"> *</span>
<a name="l00183"></a>00183 <span class="comment"> * @return True if the given table is a valid encoded one.</span>
<a name="l00184"></a>00184 <span class="comment"> * False if it&#39;s an invalid encoded table.</span>
<a name="l00185"></a>00185 <span class="comment"> *</span>
<a name="l00186"></a>00186 <span class="comment"> */</span>
<a name="l00187"></a>00187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
<a name="l00188"></a>00188 (
<a name="l00189"></a>00189 enc_tbl_name TEXT
<a name="l00190"></a>00190 )
<a name="l00191"></a>00191 RETURNS BOOL AS $$
<a name="l00192"></a>00192 DECLARE
<a name="l00193"></a>00193 num_enc_table INT;
<a name="l00194"></a>00194 num_cols INT;
<a name="l00195"></a>00195 ret BOOL := &#39;f&#39;::BOOL;
<a name="l00196"></a>00196 BEGIN
<a name="l00197"></a>00197 -- test if the table is in the training_info table
<a name="l00198"></a>00198 SELECT count(*)
<a name="l00199"></a>00199 FROM MADLIB_SCHEMA.training_info
<a name="l00200"></a>00200 WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
<a name="l00201"></a>00201 enc_tbl_name
<a name="l00202"></a>00202 INTO num_enc_table;
<a name="l00203"></a>00203
<a name="l00204"></a>00204 -- test if the name and the type of a column are valid or not
<a name="l00205"></a>00205 SELECT count(*)
<a name="l00206"></a>00206 FROM pg_attribute
<a name="l00207"></a>00207 WHERE attrelid= enc_tbl_name::regclass::oid AND
<a name="l00208"></a>00208 attnum &gt; 0 AND
<a name="l00209"></a>00209 not attisdropped AND
<a name="l00210"></a>00210 attname in (&#39;<span class="keywordtype">id</span>&#39;, &#39;fid&#39;, &#39;fval&#39;, &#39;is_cont&#39;, &#39;class&#39;) AND
<a name="l00211"></a>00211 atttypid in (&#39;int8&#39;::regtype, &#39;<span class="keywordtype">int</span>&#39;::regtype, &#39;float8&#39;::regtype,
<a name="l00212"></a>00212 &#39;<span class="keywordtype">bool</span>&#39;::regtype, &#39;<span class="keywordtype">int</span>&#39;::regtype)
<a name="l00213"></a>00213 INTO num_cols;
<a name="l00214"></a>00214
<a name="l00215"></a>00215 IF ((num_enc_table &gt; 0) AND (num_cols = 5)) THEN
<a name="l00216"></a>00216 ret = &#39;t&#39;::BOOL;
<a name="l00217"></a>00217 END IF;
<a name="l00218"></a>00218
<a name="l00219"></a>00219 RETURN ret;
<a name="l00220"></a>00220 END
<a name="l00221"></a>00221 $$ LANGUAGE PLPGSQL;
<a name="l00222"></a>00222
<a name="l00223"></a>00223
<a name="l00224"></a>00224 <span class="comment">/*</span>
<a name="l00225"></a>00225 <span class="comment"> * @brief Get the meta table name by the tree table name. </span>
<a name="l00226"></a>00226 <span class="comment"> *</span>
<a name="l00227"></a>00227 <span class="comment"> * @param tree_table The full name of the tree table.</span>
<a name="l00228"></a>00228 <span class="comment"> * </span>
<a name="l00229"></a>00229 <span class="comment"> * @return The full name of the metatable.</span>
<a name="l00230"></a>00230 <span class="comment"> *</span>
<a name="l00231"></a>00231 <span class="comment"> */</span>
<a name="l00232"></a>00232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
<a name="l00233"></a>00233 (
<a name="l00234"></a>00234 tree_table TEXT
<a name="l00235"></a>00235 )
<a name="l00236"></a>00236 RETURNS TEXT AS $$
<a name="l00237"></a>00237 DECLARE
<a name="l00238"></a>00238 metatable_name TEXT := &#39;&#39;;
<a name="l00239"></a>00239 BEGIN
<a name="l00240"></a>00240
<a name="l00241"></a>00241 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00242"></a>00242 (
<a name="l00243"></a>00243 tree_table::TEXT,
<a name="l00244"></a>00244 &#39;t&#39;::BOOL
<a name="l00245"></a>00245 );
<a name="l00246"></a>00246
<a name="l00247"></a>00247 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00248"></a>00248 (
<a name="l00249"></a>00249 &#39;MADLIB_SCHEMA.training_info&#39;::TEXT,
<a name="l00250"></a>00250 &#39;t&#39;::BOOL
<a name="l00251"></a>00251 );
<a name="l00252"></a>00252
<a name="l00253"></a>00253 SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
<a name="l00254"></a>00254 FROM MADLIB_SCHEMA.training_info
<a name="l00255"></a>00255 WHERE result_table_oid = tree_table::regclass
<a name="l00256"></a>00256 INTO metatable_name;
<a name="l00257"></a>00257
<a name="l00258"></a>00258 RETURN metatable_name;
<a name="l00259"></a>00259 END
<a name="l00260"></a>00260 $$ LANGUAGE PLPGSQL;
<a name="l00261"></a>00261
<a name="l00262"></a>00262
<a name="l00263"></a>00263 <span class="comment">/*</span>
<a name="l00264"></a>00264 <span class="comment"> * @brief Get the unknown values processing routine id. </span>
<a name="l00265"></a>00265 <span class="comment"> *</span>
<a name="l00266"></a>00266 <span class="comment"> * @param tree_table The full name of the tree table.</span>
<a name="l00267"></a>00267 <span class="comment"> *</span>
<a name="l00268"></a>00268 <span class="comment"> * @return The encoded missing value processing routine id.</span>
<a name="l00269"></a>00269 <span class="comment"> *</span>
<a name="l00270"></a>00270 <span class="comment"> */</span>
<a name="l00271"></a>00271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
<a name="l00272"></a>00272 (
<a name="l00273"></a>00273 tree_table TEXT
<a name="l00274"></a>00274 )
<a name="l00275"></a>00275 RETURNS INT AS $$
<a name="l00276"></a>00276 DECLARE
<a name="l00277"></a>00277 name TEXT;
<a name="l00278"></a>00278 BEGIN
<a name="l00279"></a>00279 name = MADLIB_SCHEMA.__get_routine_name(tree_table);
<a name="l00280"></a>00280
<a name="l00281"></a>00281 IF (name = &#39;ignore&#39;) THEN
<a name="l00282"></a>00282 RETURN 1;
<a name="l00283"></a>00283 ELSIF (name = &#39;explicit&#39;) THEN
<a name="l00284"></a>00284 RETURN 2;
<a name="l00285"></a>00285 ELSE
<a name="l00286"></a>00286 RAISE EXCEPTION &#39;__get_routine_id: %&#39;, name;
<a name="l00287"></a>00287 END IF;
<a name="l00288"></a>00288
<a name="l00289"></a>00289 END
<a name="l00290"></a>00290 $$ LANGUAGE PLPGSQL;
<a name="l00291"></a>00291
<a name="l00292"></a>00292
<a name="l00293"></a>00293 <span class="comment">/*</span>
<a name="l00294"></a>00294 <span class="comment"> * @brief Get the unknown values processing routine name. </span>
<a name="l00295"></a>00295 <span class="comment"> * The valid routine name is &#39;ignore&#39; or &#39;explicit&#39;.</span>
<a name="l00296"></a>00296 <span class="comment"> *</span>
<a name="l00297"></a>00297 <span class="comment"> * @param tree_table The full name of the tree table.</span>
<a name="l00298"></a>00298 <span class="comment"> *</span>
<a name="l00299"></a>00299 <span class="comment"> * @return The encoded missing value processing routine name.</span>
<a name="l00300"></a>00300 <span class="comment"> *</span>
<a name="l00301"></a>00301 <span class="comment"> */</span>
<a name="l00302"></a>00302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
<a name="l00303"></a>00303 (
<a name="l00304"></a>00304 tree_table TEXT
<a name="l00305"></a>00305 )
<a name="l00306"></a>00306 RETURNS TEXT AS $$
<a name="l00307"></a>00307 DECLARE
<a name="l00308"></a>00308 curstmt TEXT;
<a name="l00309"></a>00309 name TEXT;
<a name="l00310"></a>00310 BEGIN
<a name="l00311"></a>00311 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00312"></a>00312 (
<a name="l00313"></a>00313 &#39;MADLIB_SCHEMA.training_info&#39;,
<a name="l00314"></a>00314 &#39;t&#39;
<a name="l00315"></a>00315 );
<a name="l00316"></a>00316
<a name="l00317"></a>00317 curstmt = MADLIB_SCHEMA.__format
<a name="l00318"></a>00318 (
<a name="l00319"></a>00319 &#39;SELECT how2handle_missing_value
<a name="l00320"></a>00320 FROM MADLIB_SCHEMA.training_info
<a name="l00321"></a>00321 WHERE result_table_oid = &#39;&#39;%&#39;&#39;::regclass&#39;,
<a name="l00322"></a>00322 tree_table
<a name="l00323"></a>00323 );
<a name="l00324"></a>00324 EXECUTE curstmt INTO name;
<a name="l00325"></a>00325
<a name="l00326"></a>00326 RETURN name;
<a name="l00327"></a>00327 END
<a name="l00328"></a>00328 $$ LANGUAGE PLPGSQL;
<a name="l00329"></a>00329
<a name="l00330"></a>00330
<a name="l00331"></a>00331 <span class="comment">/*</span>
<a name="l00332"></a>00332 <span class="comment"> * @brief Get the name of the tree table from the encoded table name. </span>
<a name="l00333"></a>00333 <span class="comment"> *</span>
<a name="l00334"></a>00334 <span class="comment"> * @param enc_table_name The encoded table name. </span>
<a name="l00335"></a>00335 <span class="comment"> *</span>
<a name="l00336"></a>00336 <span class="comment"> * @return The full name of the tree table.</span>
<a name="l00337"></a>00337 <span class="comment"> *</span>
<a name="l00338"></a>00338 <span class="comment"> */</span>
<a name="l00339"></a>00339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
<a name="l00340"></a>00340 (
<a name="l00341"></a>00341 enc_table_name TEXT
<a name="l00342"></a>00342 )
<a name="l00343"></a>00343 RETURNS TEXT AS $$
<a name="l00344"></a>00344 DECLARE
<a name="l00345"></a>00345 curstmt TEXT;
<a name="l00346"></a>00346 name TEXT;
<a name="l00347"></a>00347 BEGIN
<a name="l00348"></a>00348 curstmt = MADLIB_SCHEMA.__format
<a name="l00349"></a>00349 (
<a name="l00350"></a>00350 &#39;SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
<a name="l00351"></a>00351 FROM MADLIB_SCHEMA.training_info
<a name="l00352"></a>00352 WHERE training_encoded_table_oid = &#39;&#39;%&#39;&#39;::regclass
<a name="l00353"></a>00353 LIMIT 1&#39;,
<a name="l00354"></a>00354 enc_table_name
<a name="l00355"></a>00355 );
<a name="l00356"></a>00356 EXECUTE curstmt INTO name;
<a name="l00357"></a>00357
<a name="l00358"></a>00358 RETURN name;
<a name="l00359"></a>00359 END
<a name="l00360"></a>00360 $$ LANGUAGE PLPGSQL;
<a name="l00361"></a>00361
<a name="l00362"></a>00362
<a name="l00363"></a>00363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
<a name="l00364"></a>00364 (
<a name="l00365"></a>00365 result FLOAT8[], -- intermediate result
<a name="l00366"></a>00366 scv FLOAT8[],
<a name="l00367"></a>00367 fid INT,
<a name="l00368"></a>00368 split_value FLOAT8
<a name="l00369"></a>00369 )
<a name="l00370"></a>00370 RETURNS FLOAT8[]
<a name="l00371"></a>00371 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_best_scv_sfunc&#39;
<a name="l00372"></a>00372 LANGUAGE C STRICT IMMUTABLE;
<a name="l00373"></a>00373
<a name="l00374"></a>00374
<a name="l00375"></a>00375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
<a name="l00376"></a>00376 (
<a name="l00377"></a>00377 sfunc1_result FLOAT8[],
<a name="l00378"></a>00378 sfunc2_result FLOAT8[]
<a name="l00379"></a>00379 )
<a name="l00380"></a>00380 RETURNS FLOAT8[]
<a name="l00381"></a>00381 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_best_scv_prefunc&#39;
<a name="l00382"></a>00382 LANGUAGE C STRICT IMMUTABLE;
<a name="l00383"></a>00383
<a name="l00384"></a>00384
<a name="l00385"></a>00385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
<a name="l00386"></a>00386 (
<a name="l00387"></a>00387 FLOAT8[], -- scv
<a name="l00388"></a>00388 INT, -- fid
<a name="l00389"></a>00389 FLOAT8 -- split_value
<a name="l00390"></a>00390 ) CASCADE;
<a name="l00391"></a>00391 CREATE
<a name="l00392"></a>00392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
<a name="l00393"></a>00393 (
<a name="l00394"></a>00394 FLOAT8[], -- scv
<a name="l00395"></a>00395 INT, -- fid
<a name="l00396"></a>00396 FLOAT8 -- split_value
<a name="l00397"></a>00397 )
<a name="l00398"></a>00398 (
<a name="l00399"></a>00399 SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
<a name="l00400"></a>00400 m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,&#39;)
<a name="l00401"></a>00401 STYPE=FLOAT8[],
<a name="l00402"></a>00402 initcond = &#39;{0, 0, 0, 0, 0, 0, 0}<span class="stringliteral">&#39;</span>
<a name="l00403"></a>00403 <span class="stringliteral">);</span>
<a name="l00404"></a>00404 <span class="stringliteral"></span>
<a name="l00405"></a>00405 <span class="stringliteral"></span>
<a name="l00406"></a>00406 <span class="stringliteral">/*</span>
<a name="l00407"></a>00407 <span class="stringliteral"> * @brief The step function is defined to process each record in the ACS set. </span>
<a name="l00408"></a>00408 <span class="stringliteral"> * The records have this format: </span>
<a name="l00409"></a>00409 <span class="stringliteral"> * {fid, fval, is_cont, split_value, le, total, tid, nid}</span>
<a name="l00410"></a>00410 <span class="stringliteral"> *</span>
<a name="l00411"></a>00411 <span class="stringliteral"> * @param result The array used to keep the best attribute&#39;</span>s info.
<a name="l00412"></a>00412 * @param sc_code The code of the split criterion.
<a name="l00413"></a>00413 * @param is_cont True - The feature is continuous.
<a name="l00414"></a>00414 * False - The feature is discrete.
<a name="l00415"></a>00415 * @param num_class The total number of classes.
<a name="l00416"></a>00416 * @param le_array The le component of the ACS record. le_array[i] is the
<a name="l00417"></a>00417 * number of samples whose <span class="keyword">class </span>code equals to i and
<a name="l00418"></a>00418 * whose fval is less-than or equal to the fval component
<a name="l00419"></a>00419 * of the ACS record being processed.
<a name="l00420"></a>00420 * @param total_array The total component of the ACS record. total_array[i] is
<a name="l00421"></a>00421 * the number of samples whose <span class="keyword">class </span>code equals to i.
<a name="l00422"></a>00422 * @param true_total The real total number of samples currently assigned to
<a name="l00423"></a>00423 * the node identified by (tid, nid). If there are missing
<a name="l00424"></a>00424 * values in fval, the sum of all elements in total_array
<a name="l00425"></a>00425 * will be less than true_total.
<a name="l00426"></a>00426 *
<a name="l00427"></a>00427 * @<span class="keywordflow">return</span> A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
<a name="l00428"></a>00428 * in dt.c <span class="keywordflow">for</span> the detailed information of <span class="keyword">this</span> array.
<a name="l00429"></a>00429 *
<a name="l00430"></a>00430 */
<a name="l00431"></a>00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
<a name="l00432"></a>00432 (
<a name="l00433"></a>00433 result FLOAT8[],
<a name="l00434"></a>00434 sc_code INT,
<a name="l00435"></a>00435 is_cont BOOLEAN,
<a name="l00436"></a>00436 num_class INT,
<a name="l00437"></a>00437 le_array FLOAT8[],
<a name="l00438"></a>00438 total_array FLOAT8[],
<a name="l00439"></a>00439 true_total BIGINT
<a name="l00440"></a>00440 )
<a name="l00441"></a>00441 RETURNS FLOAT8[]
<a name="l00442"></a>00442 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_sfunc&#39;</span>
<a name="l00443"></a>00443 LANGUAGE C IMMUTABLE;
<a name="l00444"></a>00444
<a name="l00445"></a>00445
<a name="l00446"></a>00446 <span class="comment">/*</span>
<a name="l00447"></a>00447 <span class="comment"> * @brief The pre-function for the aggregation of splitting criteria values. It </span>
<a name="l00448"></a>00448 <span class="comment"> * takes the state arrays produced by two sfunc calls and combines them together.</span>
<a name="l00449"></a>00449 <span class="comment"> *</span>
<a name="l00450"></a>00450 <span class="comment"> * @param sfunc1_result The array from sfunc1.</span>
<a name="l00451"></a>00451 <span class="comment"> * @param sfunc2_result The array from sfunc2.</span>
<a name="l00452"></a>00452 <span class="comment"> *</span>
<a name="l00453"></a>00453 <span class="comment"> * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX</span>
<a name="l00454"></a>00454 <span class="comment"> * in dt.c for the detailed information of this array.</span>
<a name="l00455"></a>00455 <span class="comment"> *</span>
<a name="l00456"></a>00456 <span class="comment"> */</span>
<a name="l00457"></a>00457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
<a name="l00458"></a>00458 (
<a name="l00459"></a>00459 sfunc1_result FLOAT8[],
<a name="l00460"></a>00460 sfunc2_result FLOAT8[]
<a name="l00461"></a>00461 )
<a name="l00462"></a>00462 RETURNS FLOAT8[]
<a name="l00463"></a>00463 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_prefunc&#39;</span>
<a name="l00464"></a>00464 LANGUAGE C STRICT IMMUTABLE;
<a name="l00465"></a>00465
<a name="l00466"></a>00466
<a name="l00467"></a>00467 <span class="comment">/*</span>
<a name="l00468"></a>00468 <span class="comment"> * @brief The final function for the aggregation of splitting criteria values.</span>
<a name="l00469"></a>00469 <span class="comment"> * It takes the state array produced by the sfunc and produces a</span>
<a name="l00470"></a>00470 <span class="comment"> * 5-element array.</span>
<a name="l00471"></a>00471 <span class="comment"> *</span>
<a name="l00472"></a>00472 <span class="comment"> * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc</span>
<a name="l00473"></a>00473 <span class="comment"> *</span>
<a name="l00474"></a>00474 <span class="comment"> * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX</span>
<a name="l00475"></a>00475 <span class="comment"> * in dt.c for the detailed information of this array.</span>
<a name="l00476"></a>00476 <span class="comment"> *</span>
<a name="l00477"></a>00477 <span class="comment"> */</span>
<a name="l00478"></a>00478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
<a name="l00479"></a>00479 (
<a name="l00480"></a>00480 internal_result FLOAT8[]
<a name="l00481"></a>00481 )
<a name="l00482"></a>00482 RETURNS FLOAT8[]
<a name="l00483"></a>00483 AS <span class="stringliteral">&#39;MODULE_PATHNAME&#39;</span>, <span class="stringliteral">&#39;dt_scv_aggr_ffunc&#39;</span>
<a name="l00484"></a>00484 LANGUAGE C STRICT IMMUTABLE;
<a name="l00485"></a>00485
<a name="l00486"></a>00486
<a name="l00487"></a>00487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
<a name="l00488"></a>00488 (
<a name="l00489"></a>00489 INT, -- sc
<a name="l00490"></a>00490 BOOLEAN, -- is_cont
<a name="l00491"></a>00491 INT, -- total number of classes
<a name="l00492"></a>00492 FLOAT8[], -- le array
<a name="l00493"></a>00493 FLOAT8[], -- total count array
<a name="l00494"></a>00494 BIGINT -- the total number of samples
<a name="l00495"></a>00495 ) CASCADE;
<a name="l00496"></a>00496 CREATE
<a name="l00497"></a>00497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
<a name="l00498"></a>00498 (
<a name="l00499"></a>00499 INT, -- sc
<a name="l00500"></a>00500 BOOLEAN, -- is_cont
<a name="l00501"></a>00501 INT, -- total number of classes
<a name="l00502"></a>00502 FLOAT8[], -- le array
<a name="l00503"></a>00503 FLOAT8[], -- total count array
<a name="l00504"></a>00504 BIGINT -- the total number of samples
<a name="l00505"></a>00505 )
<a name="l00506"></a>00506 (
<a name="l00507"></a>00507 SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
<a name="l00508"></a>00508 m4_ifdef(`__GREENPLUM__<span class="stringliteral">&#39;, `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,&#39;</span>)
<a name="l00509"></a>00509 FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
<a name="l00510"></a>00510 STYPE=FLOAT8[],
<a name="l00511"></a>00511 initcond = &#39;{0, 0, 0, 0, 0, 0, 0, 0, 0}<span class="stringliteral">&#39;</span>
<a name="l00512"></a>00512 <span class="stringliteral"> -- 1 sc: 1 infogain, 2 gainratio, 3 gini</span>
<a name="l00513"></a>00513 <span class="stringliteral"> -- 2 is_cont</span>
<a name="l00514"></a>00514 <span class="stringliteral"> -- 3 scv_class_info</span>
<a name="l00515"></a>00515 <span class="stringliteral"> -- 4 scv_attr_info</span>
<a name="l00516"></a>00516 <span class="stringliteral"> -- 5 scv_class_attr_info</span>
<a name="l00517"></a>00517 <span class="stringliteral"> -- 6 scv_count</span>
<a name="l00518"></a>00518 <span class="stringliteral"> -- 7 scv_total</span>
<a name="l00519"></a>00519 <span class="stringliteral"> -- 8 max_class_id</span>
<a name="l00520"></a>00520 <span class="stringliteral"> -- 9 max_class_count</span>
<a name="l00521"></a>00521 <span class="stringliteral">);</span>
<a name="l00522"></a>00522 <span class="stringliteral"></span>
<a name="l00523"></a>00523 <span class="stringliteral"></span>
<a name="l00524"></a>00524 <span class="stringliteral">/*</span>
<a name="l00525"></a>00525 <span class="stringliteral"> * @brief Retrieve the specified number of unique features for a node.</span>
<a name="l00526"></a>00526 <span class="stringliteral"> * Discrete features used by ancestor nodes will be excluded.</span>
<a name="l00527"></a>00527 <span class="stringliteral"> * If the number of remaining features is less than or equal to the</span>
<a name="l00528"></a>00528 <span class="stringliteral"> * requested number of features, then all the remaining features</span>
<a name="l00529"></a>00529 <span class="stringliteral"> * will be returned. Otherwise, we will sample the requested </span>
<a name="l00530"></a>00530 <span class="stringliteral"> * number of features from the remaining features.</span>
<a name="l00531"></a>00531 <span class="stringliteral"> *</span>
<a name="l00532"></a>00532 <span class="stringliteral"> * @param num_req_features The number of requested features.</span>
<a name="l00533"></a>00533 <span class="stringliteral"> * @param num_features The total number of features.</span>
<a name="l00534"></a>00534 <span class="stringliteral"> * @param nid The ID of the node for which the</span>
<a name="l00535"></a>00535 <span class="stringliteral"> * features are sampled.</span>
<a name="l00536"></a>00536 <span class="stringliteral"> * @param dp_fids The IDs of the discrete features</span>
<a name="l00537"></a>00537 <span class="stringliteral"> * used by the ancestors.</span>
<a name="l00538"></a>00538 <span class="stringliteral"> *</span>
<a name="l00539"></a>00539 <span class="stringliteral"> * @return An array containing all the IDs of chosen features.</span>
<a name="l00540"></a>00540 <span class="stringliteral"> *</span>
<a name="l00541"></a>00541 <span class="stringliteral"> */</span>
<a name="l00542"></a>00542 <span class="stringliteral">CREATE OR REPLACE FUNCTION </span>
<a name="l00543"></a>00543 <span class="stringliteral">MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])</span>
<a name="l00544"></a>00544 <span class="stringliteral">RETURNS INT[]</span>
<a name="l00545"></a>00545 <span class="stringliteral">AS &#39;</span>MODULE_PATHNAME<span class="stringliteral">&#39;, &#39;</span>dt_get_node_split_fids<span class="stringliteral">&#39;</span>
<a name="l00546"></a>00546 <span class="stringliteral">LANGUAGE C VOLATILE;</span>
<a name="l00547"></a>00547 <span class="stringliteral"></span>
<a name="l00548"></a>00548 <span class="stringliteral"></span>
<a name="l00549"></a>00549 <span class="stringliteral">/*</span>
<a name="l00550"></a>00550 <span class="stringliteral"> * @brief Retrieve the selected features for a node. We will create a table, named </span>
<a name="l00551"></a>00551 <span class="stringliteral"> * sf_association, to store the association between selected feature IDs and</span>
<a name="l00552"></a>00552 <span class="stringliteral"> * node IDs.</span>
<a name="l00553"></a>00553 <span class="stringliteral"> *</span>
<a name="l00554"></a>00554 <span class="stringliteral"> * @param nid_table_name The full name of the table which contains all the </span>
<a name="l00555"></a>00555 <span class="stringliteral"> * node IDs.</span>
<a name="l00556"></a>00556 <span class="stringliteral"> * @param result_table_name The full name of the table which contains the parent</span>
<a name="l00557"></a>00557 <span class="stringliteral"> * discrete features for each node.</span>
<a name="l00558"></a>00558 <span class="stringliteral"> * @param num_chosen_fids The number of feature IDs that will be chosen for a node.</span>
<a name="l00559"></a>00559 <span class="stringliteral"> * @param total_num_fids The total number of feature IDs, total_num_fids </span>
<a name="l00560"></a>00560 <span class="stringliteral"> * &gt;= num_chosen_fids.</span>
<a name="l00561"></a>00561 <span class="stringliteral"> * If num_chosen_fids &lt; total_num_fids, then we will </span>
<a name="l00562"></a>00562 <span class="stringliteral"> * randomly select num_chosen_fids features from all</span>
<a name="l00563"></a>00563 <span class="stringliteral"> * the features. Otherwise, we will return all the </span>
<a name="l00564"></a>00564 <span class="stringliteral"> * features except those that belong to the parent discrete</span>
<a name="l00565"></a>00565 <span class="stringliteral"> * features for a node.</span>
<a name="l00566"></a>00566 <span class="stringliteral"> * @param verbosity &gt; 0 means this function runs in verbose mode.</span>
<a name="l00567"></a>00567 <span class="stringliteral"> * </span>
<a name="l00568"></a>00568 <span class="stringliteral"> * @return A constant string for the association table name.</span>
<a name="l00569"></a>00569 <span class="stringliteral"> *</span>
<a name="l00570"></a>00570 <span class="stringliteral"> */</span>
<a name="l00571"></a>00571 <span class="stringliteral">CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes</span>
<a name="l00572"></a>00572 <span class="stringliteral"> (</span>
<a name="l00573"></a>00573 <span class="stringliteral"> nid_table_name TEXT,</span>
<a name="l00574"></a>00574 <span class="stringliteral"> result_table_name TEXT,</span>
<a name="l00575"></a>00575 <span class="stringliteral"> num_chosen_fids INT,</span>
<a name="l00576"></a>00576 <span class="stringliteral"> total_num_fids INT,</span>
<a name="l00577"></a>00577 <span class="stringliteral"> verbosity INT</span>
<a name="l00578"></a>00578 <span class="stringliteral"> )</span>
<a name="l00579"></a>00579 <span class="stringliteral">RETURNS TEXT AS $$</span>
<a name="l00580"></a>00580 <span class="stringliteral">DECLARE</span>
<a name="l00581"></a>00581 <span class="stringliteral"> curstmt TEXT;</span>
<a name="l00582"></a>00582 <span class="stringliteral">BEGIN</span>
<a name="l00583"></a>00583 <span class="stringliteral"> -- The sf_association table records which features are used</span>
<a name="l00584"></a>00584 <span class="stringliteral"> -- for finding the best split for a node.</span>
<a name="l00585"></a>00585 <span class="stringliteral"> -- It has two columns:</span>
<a name="l00586"></a>00586 <span class="stringliteral"> -- nid -- The id of a node.</span>
<a name="l00587"></a>00587 <span class="stringliteral"> -- fid -- The id of a feature.</span>
<a name="l00588"></a>00588 <span class="stringliteral"> EXECUTE &#39;</span>TRUNCATE sf_assoc<span class="stringliteral">&#39;;</span>
<a name="l00589"></a>00589 <span class="stringliteral"> </span>
<a name="l00590"></a>00590 <span class="stringliteral"> curstmt = MADLIB_SCHEMA.__format</span>
<a name="l00591"></a>00591 <span class="stringliteral"> (</span>
<a name="l00592"></a>00592 <span class="stringliteral"> &#39;</span>INSERT INTO sf_assoc(nid, fid)
<a name="l00593"></a>00593 SELECT
<a name="l00594"></a>00594 nid,
<a name="l00595"></a>00595 unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
<a name="l00596"></a>00596 nid,dp_ids)) as fid
<a name="l00597"></a>00597 FROM (SELECT nid, dp_ids
<a name="l00598"></a>00598 FROM % s1, % s2
<a name="l00599"></a>00599 WHERE s1.nid = s2.<span class="keywordtype">id</span>
<a name="l00600"></a>00600 GROUP BY nid, dp_ids) t&#39;,
<a name="l00601"></a>00601 ARRAY[
<a name="l00602"></a>00602 num_chosen_fids::TEXT,
<a name="l00603"></a>00603 total_num_fids::TEXT,
<a name="l00604"></a>00604 nid_table_name,
<a name="l00605"></a>00605 result_table_name
<a name="l00606"></a>00606 ]
<a name="l00607"></a>00607 );
<a name="l00608"></a>00608
<a name="l00609"></a>00609 IF (verbosity &gt; 0) THEN
<a name="l00610"></a>00610 RAISE INFO &#39;build sample feature association stmt: %&#39;, curstmt;
<a name="l00611"></a>00611 END IF;
<a name="l00612"></a>00612
<a name="l00613"></a>00613 EXECUTE curstmt;
<a name="l00614"></a>00614
<a name="l00615"></a>00615 -- we return a constant <span class="keywordtype">string</span> for the association table name
<a name="l00616"></a>00616 return &#39;sf_assoc&#39;;
<a name="l00617"></a>00617
<a name="l00618"></a>00618 END
<a name="l00619"></a>00619 $$ LANGUAGE PLPGSQL;
<a name="l00620"></a>00620
<a name="l00621"></a>00621
<a name="l00622"></a>00622 <span class="comment">/*</span>
<a name="l00623"></a>00623 <span class="comment"> * This UDT is used to keep the times of generating acc.</span>
<a name="l00624"></a>00624 <span class="comment"> *</span>
<a name="l00625"></a>00625 <span class="comment"> * calc_pre_time The time of pre-processing.</span>
<a name="l00626"></a>00626 <span class="comment"> * calc_acc_time The time of calculating acc.</span>
<a name="l00627"></a>00627 <span class="comment"> *</span>
<a name="l00628"></a>00628 <span class="comment"> */</span>
<a name="l00629"></a>00629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
<a name="l00630"></a>00630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
<a name="l00631"></a>00631 (
<a name="l00632"></a>00632 calc_pre_time INTERVAL,
<a name="l00633"></a>00633 calc_acc_time INTERVAL
<a name="l00634"></a>00634 );
<a name="l00635"></a>00635
<a name="l00636"></a>00636
<a name="l00637"></a>00637 <span class="comment">/*</span>
<a name="l00638"></a>00638 <span class="comment"> * @brief Generate the ACC for current leaf nodes.</span>
<a name="l00639"></a>00639 <span class="comment"> *</span>
<a name="l00640"></a>00640 <span class="comment"> * @param encoded_table_name The full name of the encoded table for the </span>
<a name="l00641"></a>00641 <span class="comment"> * training table.</span>
<a name="l00642"></a>00642 <span class="comment"> * @param metatable_name The full name of the metatable that contains the </span>
<a name="l00643"></a>00643 <span class="comment"> * relevant information of the input table.</span>
<a name="l00644"></a>00644 <span class="comment"> * @param result_table_name The full name of the training result table.</span>
<a name="l00645"></a>00645 <span class="comment"> * @param num_featrue_try The number of features that will be chosen per node. </span>
<a name="l00646"></a>00646 <span class="comment"> * @param num_classes Total number of classes in training set.</span>
<a name="l00647"></a>00647 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l00648"></a>00648 <span class="comment"> * </span>
<a name="l00649"></a>00649 <span class="comment"> * @return The time information for generating ACC.</span>
<a name="l00650"></a>00650 <span class="comment"> *</span>
<a name="l00651"></a>00651 <span class="comment"> */</span>
<a name="l00652"></a>00652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
<a name="l00653"></a>00653 (
<a name="l00654"></a>00654 encoded_table_name TEXT,
<a name="l00655"></a>00655 metatable_name TEXT,
<a name="l00656"></a>00656 result_table_name TEXT,
<a name="l00657"></a>00657 tr_table_name TEXT,
<a name="l00658"></a>00658 sf_table_name TEXT,
<a name="l00659"></a>00659 num_featrue_try INT,
<a name="l00660"></a>00660 num_classes INT,
<a name="l00661"></a>00661 sampling_needed BOOLEAN,
<a name="l00662"></a>00662 verbosity INT
<a name="l00663"></a>00663 )
<a name="l00664"></a>00664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
<a name="l00665"></a>00665 DECLARE
<a name="l00666"></a>00666 curstmt TEXT := &#39;&#39;;
<a name="l00667"></a>00667 num_fids INT := 1;
<a name="l00668"></a>00668 begin_calc_acc TIMESTAMP;
<a name="l00669"></a>00669 begin_calc_pre TIMESTAMP;
<a name="l00670"></a>00670 ret MADLIB_SCHEMA.__gen_acc_time;
<a name="l00671"></a>00671 select_stmt TEXT;
<a name="l00672"></a>00672 BEGIN
<a name="l00673"></a>00673 begin_calc_pre = clock_timestamp();
<a name="l00674"></a>00674
<a name="l00675"></a>00675 -- get the number of features
<a name="l00676"></a>00676 curstmt = MADLIB_SCHEMA.__format
<a name="l00677"></a>00677 (
<a name="l00678"></a>00678 &#39;SELECT COUNT(<span class="keywordtype">id</span>)
<a name="l00679"></a>00679 FROM %
<a name="l00680"></a>00680 WHERE column_type = &#39;&#39;f&#39;&#39;&#39;,
<a name="l00681"></a>00681 metatable_name
<a name="l00682"></a>00682 );
<a name="l00683"></a>00683 EXECUTE curstmt INTO num_fids;
<a name="l00684"></a>00684
<a name="l00685"></a>00685 -- preprocessing time
<a name="l00686"></a>00686 ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
<a name="l00687"></a>00687 begin_calc_acc = clock_timestamp();
<a name="l00688"></a>00688
<a name="l00689"></a>00689 IF (sampling_needed) THEN
<a name="l00690"></a>00690 PERFORM MADLIB_SCHEMA.__get_features_of_nodes
<a name="l00691"></a>00691 (
<a name="l00692"></a>00692 tr_table_name,
<a name="l00693"></a>00693 result_table_name,
<a name="l00694"></a>00694 num_featrue_try,
<a name="l00695"></a>00695 num_fids,
<a name="l00696"></a>00696 verbosity
<a name="l00697"></a>00697 );
<a name="l00698"></a>00698
<a name="l00699"></a>00699 select_stmt = MADLIB_SCHEMA.__format
<a name="l00700"></a>00700 (
<a name="l00701"></a>00701 &#39;SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
<a name="l00702"></a>00702 ed.class, sum(weight) as count
<a name="l00703"></a>00703 FROM % ed, % tr, % sf
<a name="l00704"></a>00704 WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.<span class="keywordtype">id</span> = tr.<span class="keywordtype">id</span>
<a name="l00705"></a>00705 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
<a name="l00706"></a>00706 ed.is_cont, ed.class&#39;,
<a name="l00707"></a>00707 ARRAY[
<a name="l00708"></a>00708 encoded_table_name,
<a name="l00709"></a>00709 tr_table_name,
<a name="l00710"></a>00710 sf_table_name
<a name="l00711"></a>00711 ]
<a name="l00712"></a>00712 );
<a name="l00713"></a>00713 ELSE
<a name="l00714"></a>00714 select_stmt = MADLIB_SCHEMA.__format
<a name="l00715"></a>00715 (
<a name="l00716"></a>00716 &#39;SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
<a name="l00717"></a>00717 ed.class, sum(weight) as count
<a name="l00718"></a>00718 FROM % ed, % tr
<a name="l00719"></a>00719 WHERE ed.<span class="keywordtype">id</span> = tr.<span class="keywordtype">id</span>
<a name="l00720"></a>00720 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
<a name="l00721"></a>00721 ed.is_cont, ed.class&#39;,
<a name="l00722"></a>00722 ARRAY[
<a name="l00723"></a>00723 encoded_table_name,
<a name="l00724"></a>00724 tr_table_name
<a name="l00725"></a>00725 ]
<a name="l00726"></a>00726 );
<a name="l00727"></a>00727 END IF;
<a name="l00728"></a>00728 DROP TABLE IF EXISTS training_instance_aux;
<a name="l00729"></a>00729 curstmt = MADLIB_SCHEMA.__format
<a name="l00730"></a>00730 (
<a name="l00731"></a>00731 &#39;CREATE TEMP TABLE training_instance_aux AS
<a name="l00732"></a>00732 SELECT tid, nid, fid, fval, is_cont,
<a name="l00733"></a>00733 MADLIB_SCHEMA.__dt_acc_count_aggr
<a name="l00734"></a>00734 (%,count::BIGINT,class::INT) AS count
<a name="l00735"></a>00735 FROM
<a name="l00736"></a>00736 (
<a name="l00737"></a>00737 %
<a name="l00738"></a>00738 ) l
<a name="l00739"></a>00739 GROUP BY tid,nid,fid, fval,is_cont
<a name="l00740"></a>00740 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (fid, fval)&#39;)&#39;,
<a name="l00741"></a>00741 ARRAY[
<a name="l00742"></a>00742 num_classes::TEXT,
<a name="l00743"></a>00743 select_stmt
<a name="l00744"></a>00744 ]
<a name="l00745"></a>00745 );
<a name="l00746"></a>00746
<a name="l00747"></a>00747 IF ( verbosity&gt;0 ) THEN
<a name="l00748"></a>00748 RAISE INFO &#39;%&#39;, curstmt;
<a name="l00749"></a>00749 END IF;
<a name="l00750"></a>00750
<a name="l00751"></a>00751 EXECUTE curstmt;
<a name="l00752"></a>00752 ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
<a name="l00753"></a>00753
<a name="l00754"></a>00754 RETURN ret;
<a name="l00755"></a>00755 END
<a name="l00756"></a>00756 $$ LANGUAGE PLPGSQL;
<a name="l00757"></a>00757
<a name="l00758"></a>00758
<a name="l00759"></a>00759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
<a name="l00760"></a>00760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
<a name="l00761"></a>00761 (
<a name="l00762"></a>00762 numOfOrgClasses BIGINT[]
<a name="l00763"></a>00763 );
<a name="l00764"></a>00764
<a name="l00765"></a>00765
<a name="l00766"></a>00766 <span class="comment">/*</span>
<a name="l00767"></a>00767 <span class="comment"> * @brief The step function for aggregating the class counts while doing Reduced </span>
<a name="l00768"></a>00768 <span class="comment"> * Error Pruning (REP).</span>
<a name="l00769"></a>00769 <span class="comment"> *</span>
<a name="l00770"></a>00770 <span class="comment"> * @param class_count_array The array used to store the accumulated information.</span>
<a name="l00771"></a>00771 <span class="comment"> * [0]: the total number of mis-classified samples.</span>
<a name="l00772"></a>00772 <span class="comment"> * [i]: the number of samples belonging to the ith class.</span>
<a name="l00773"></a>00773 <span class="comment"> * @param classified_class The predicted class based on our trained DT model.</span>
<a name="l00774"></a>00774 <span class="comment"> * @param original_class The real class value provided in the validation set.</span>
<a name="l00775"></a>00775 <span class="comment"> * @param max_num_of_classes The total number of distinct class values. </span>
<a name="l00776"></a>00776 <span class="comment"> * </span>
<a name="l00777"></a>00777 <span class="comment"> * @return An updated class count array.</span>
<a name="l00778"></a>00778 <span class="comment"> *</span>
<a name="l00779"></a>00779 <span class="comment"> */</span>
<a name="l00780"></a>00780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
<a name="l00781"></a>00781 (
<a name="l00782"></a>00782 class_count_array BIGINT[],
<a name="l00783"></a>00783 classified_class INT,
<a name="l00784"></a>00784 original_class INT,
<a name="l00785"></a>00785 max_num_of_classes INT
<a name="l00786"></a>00786 )
<a name="l00787"></a>00787 RETURNS BIGINT[]
<a name="l00788"></a>00788 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_rep_aggr_class_count_sfunc&#39;
<a name="l00789"></a>00789 LANGUAGE C IMMUTABLE;
<a name="l00790"></a>00790
<a name="l00791"></a>00791
<a name="l00792"></a>00792 <span class="comment">/*</span>
<a name="l00793"></a>00793 <span class="comment"> * @brief Add the corresponding elements of the input arrays </span>
<a name="l00794"></a>00794 <span class="comment"> * to create a new one.</span>
<a name="l00795"></a>00795 <span class="comment"> *</span>
<a name="l00796"></a>00796 <span class="comment"> * @param 1 arg The array 1.</span>
<a name="l00797"></a>00797 <span class="comment"> * @param 2 arg The array 2.</span>
<a name="l00798"></a>00798 <span class="comment"> * </span>
<a name="l00799"></a>00799 <span class="comment"> * @return The new array.</span>
<a name="l00800"></a>00800 <span class="comment"> *</span>
<a name="l00801"></a>00801 <span class="comment"> */</span>
<a name="l00802"></a>00802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
<a name="l00803"></a>00803 (
<a name="l00804"></a>00804 BIGINT[],
<a name="l00805"></a>00805 BIGINT[]
<a name="l00806"></a>00806 )
<a name="l00807"></a>00807 RETURNS BIGINT[]
<a name="l00808"></a>00808 AS &#39;MODULE_PATHNAME&#39;, &#39;bigint_array_add&#39;
<a name="l00809"></a>00809 LANGUAGE C IMMUTABLE;
<a name="l00810"></a>00810
<a name="l00811"></a>00811
<a name="l00812"></a>00812 <span class="comment">/*</span>
<a name="l00813"></a>00813 <span class="comment"> * @brief The final function for aggregating the class counts for REP. </span>
<a name="l00814"></a>00814 <span class="comment"> * It takes the class count array produced by the sfunc and produces a </span>
<a name="l00815"></a>00815 <span class="comment"> * two-element array. The first element is the ID of the class that has </span>
<a name="l00816"></a>00816 <span class="comment"> * the maximum number of samples represented by the root node of the subtree</span>
<a name="l00817"></a>00817 <span class="comment"> * being processed. The second element is the number of reduced </span>
<a name="l00818"></a>00818 <span class="comment"> * misclassified samples if the leaf nodes of the subtree are pruned.</span>
<a name="l00819"></a>00819 <span class="comment"> *</span>
<a name="l00820"></a>00820 <span class="comment"> * @param class_count_data The array containing all the information for the </span>
<a name="l00821"></a>00821 <span class="comment"> * calculation of Reduced-Error pruning. </span>
<a name="l00822"></a>00822 <span class="comment"> * </span>
<a name="l00823"></a>00823 <span class="comment"> * @return A two element array.</span>
<a name="l00824"></a>00824 <span class="comment"> *</span>
<a name="l00825"></a>00825 <span class="comment"> */</span>
<a name="l00826"></a>00826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
<a name="l00827"></a>00827 (
<a name="l00828"></a>00828 class_count_array BIGINT[]
<a name="l00829"></a>00829 )
<a name="l00830"></a>00830 RETURNS BIGINT[]
<a name="l00831"></a>00831 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_rep_aggr_class_count_ffunc&#39;
<a name="l00832"></a>00832 LANGUAGE C STRICT IMMUTABLE;
<a name="l00833"></a>00833
<a name="l00834"></a>00834
<a name="l00835"></a>00835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
<a name="l00836"></a>00836 (
<a name="l00837"></a>00837 INT,
<a name="l00838"></a>00838 INT,
<a name="l00839"></a>00839 INT
<a name="l00840"></a>00840 );
<a name="l00841"></a>00841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
<a name="l00842"></a>00842 (
<a name="l00843"></a>00843 INT,
<a name="l00844"></a>00844 INT,
<a name="l00845"></a>00845 INT
<a name="l00846"></a>00846 )
<a name="l00847"></a>00847 (
<a name="l00848"></a>00848 SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
<a name="l00849"></a>00849 m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;)
<a name="l00850"></a>00850 FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
<a name="l00851"></a>00851 STYPE=BIGINT[]
<a name="l00852"></a>00852 );
<a name="l00853"></a>00853
<a name="l00854"></a>00854
<a name="l00855"></a>00855 <span class="comment">/*</span>
<a name="l00856"></a>00856 <span class="comment"> * @brief The step function of the aggregate __array_indexed_agg.</span>
<a name="l00857"></a>00857 <span class="comment"> *</span>
<a name="l00858"></a>00858 <span class="comment"> * @param state The step state array of the aggregate function.</span>
<a name="l00859"></a>00859 <span class="comment"> * @param elem The element to be filled into the state array.</span>
<a name="l00860"></a>00860 <span class="comment"> * @param elem_cnt The number of elements.</span>
<a name="l00861"></a>00861 <span class="comment"> * @param elem_idx the subscript of &quot;elem&quot; in the state array.</span>
<a name="l00862"></a>00862 <span class="comment"> * </span>
<a name="l00863"></a>00863 <span class="comment"> */</span>
<a name="l00864"></a>00864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
<a name="l00865"></a>00865 (
<a name="l00866"></a>00866 state float8[],
<a name="l00867"></a>00867 elem float8,
<a name="l00868"></a>00868 elem_cnt int8,
<a name="l00869"></a>00869 elem_idx int8
<a name="l00870"></a>00870 )
<a name="l00871"></a>00871 RETURNS float8[]
<a name="l00872"></a>00872 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_sfunc&#39;
<a name="l00873"></a>00873 LANGUAGE C IMMUTABLE;
<a name="l00874"></a>00874
<a name="l00875"></a>00875
<a name="l00876"></a>00876 <span class="comment">/*</span>
<a name="l00877"></a>00877 <span class="comment"> * @brief The Pre-function of the aggregate __array_indexed_agg.</span>
<a name="l00878"></a>00878 <span class="comment"> * </span>
<a name="l00879"></a>00879 <span class="comment"> * @param arg0 The first state array.</span>
<a name="l00880"></a>00880 <span class="comment"> * @param arg1 The second state array.</span>
<a name="l00881"></a>00881 <span class="comment"> * </span>
<a name="l00882"></a>00882 <span class="comment"> * @return The combined state. </span>
<a name="l00883"></a>00883 <span class="comment"> *</span>
<a name="l00884"></a>00884 <span class="comment"> */</span>
<a name="l00885"></a>00885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
<a name="l00886"></a>00886 (
<a name="l00887"></a>00887 float8[],
<a name="l00888"></a>00888 float8[]
<a name="l00889"></a>00889 )
<a name="l00890"></a>00890 RETURNS float8[]
<a name="l00891"></a>00891 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_prefunc&#39;
<a name="l00892"></a>00892 LANGUAGE C STRICT IMMUTABLE;
<a name="l00893"></a>00893
<a name="l00894"></a>00894
<a name="l00895"></a>00895 <span class="comment">/*</span>
<a name="l00896"></a>00896 <span class="comment"> * @brief The final function of __array_indexed_agg.</span>
<a name="l00897"></a>00897 <span class="comment"> * </span>
<a name="l00898"></a>00898 <span class="comment"> * @param state The state array.</span>
<a name="l00899"></a>00899 <span class="comment"> * </span>
<a name="l00900"></a>00900 <span class="comment"> * @return The aggregate result.</span>
<a name="l00901"></a>00901 <span class="comment"> *</span>
<a name="l00902"></a>00902 <span class="comment"> */</span>
<a name="l00903"></a>00903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
<a name="l00904"></a>00904 (
<a name="l00905"></a>00905 float8[]
<a name="l00906"></a>00906 )
<a name="l00907"></a>00907 RETURNS float8[]
<a name="l00908"></a>00908 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_array_indexed_agg_ffunc&#39;
<a name="l00909"></a>00909 LANGUAGE C IMMUTABLE;
<a name="l00910"></a>00910
<a name="l00911"></a>00911
<a name="l00912"></a>00912 <span class="comment">/*</span>
<a name="l00913"></a>00913 <span class="comment"> * @brief This aggregate is the same as array_agg, which accumulates</span>
<a name="l00914"></a>00914 <span class="comment"> * the elements in each group into an array, except that we allow users </span>
<a name="l00915"></a>00915 <span class="comment"> * to provide the subscript for each element. This aggregate will be </span>
<a name="l00916"></a>00916 <span class="comment"> * invoked as HashAggregate, while array_agg will be called as </span>
<a name="l00917"></a>00917 <span class="comment"> * GroupAggregate. Therefore, our implementation has better performance</span>
<a name="l00918"></a>00918 <span class="comment"> * than array_agg.</span>
<a name="l00919"></a>00919 <span class="comment"> * </span>
<a name="l00920"></a>00920 <span class="comment"> * @param elem The element to be fed into the returned array of this aggregate.</span>
<a name="l00921"></a>00921 <span class="comment"> * @param elem_cnt The number of elements.</span>
<a name="l00922"></a>00922 <span class="comment"> * @param elem_idx The subscript of the element.</span>
<a name="l00923"></a>00923 <span class="comment"> *</span>
<a name="l00924"></a>00924 <span class="comment"> * @return The aggregated array.</span>
<a name="l00925"></a>00925 <span class="comment"> *</span>
<a name="l00926"></a>00926 <span class="comment"> */</span>
<a name="l00927"></a>00927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
<a name="l00928"></a>00928 SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
<a name="l00929"></a>00929 m4_ifdef( `__GREENPLUM__&#39;,`PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,&#39;)
<a name="l00930"></a>00930 FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
<a name="l00931"></a>00931 STYPE = float8[]
<a name="l00932"></a>00932 );
<a name="l00933"></a>00933
<a name="l00934"></a>00934
<a name="l00935"></a>00935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
<a name="l00936"></a>00936 (
<a name="l00937"></a>00937 count_array BIGINT[], -- running state: this function is the SFUNC of __dt_acc_count_aggr, whose STYPE is BIGINT[]
<a name="l00938"></a>00938 num_of_class INT, -- first argument of the aggregate call
<a name="l00939"></a>00939 count BIGINT, -- second argument of the aggregate call
<a name="l00940"></a>00940 class INT -- third argument of the aggregate call
<a name="l00941"></a>00941 )
<a name="l00942"></a>00942 RETURNS BIGINT[]
<a name="l00943"></a>00943 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_acc_count_sfunc&#39; -- implemented in C under the symbol dt_acc_count_sfunc
<a name="l00944"></a>00944 LANGUAGE C VOLATILE;
<a name="l00945"></a>00945
<a name="l00946"></a>00946
<a name="l00947"></a>00947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
<a name="l00948"></a>00948 (
<a name="l00949"></a>00949 INT, -- passed to the SFUNC as num_of_class
<a name="l00950"></a>00950 BIGINT, -- passed to the SFUNC as count
<a name="l00951"></a>00951 INT -- passed to the SFUNC as class
<a name="l00952"></a>00952 )
<a name="l00953"></a>00953 (
<a name="l00954"></a>00954 SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
<a name="l00955"></a>00955 m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;) -- emitted only when the Greenplum macro is defined
<a name="l00956"></a>00956 STYPE=BIGINT[] -- state type; no INITCOND is given, so the state starts out NULL
<a name="l00957"></a>00957 );
<a name="l00958"></a>00958
<a name="l00959"></a>00959
<a name="l00960"></a>00960 <span class="comment">/*</span>
<a name="l00961"></a>00961 <span class="comment"> * @brief This aggregate is created for PostgreSQL, which doesn&#39;t support the</span>
<a name="l00962"></a>00962 <span class="comment"> * sum function over an array.</span>
<a name="l00963"></a>00963 <span class="comment"> * </span>
<a name="l00964"></a>00964 <span class="comment"> * @param elem The array to be added into the returned array of this aggregate.</span>
<a name="l00965"></a>00965 <span class="comment"> *</span>
<a name="l00966"></a>00966 <span class="comment"> * @return The array with the sum of all the input arrays in a group.</span>
<a name="l00967"></a>00967 <span class="comment"> *</span>
<a name="l00968"></a>00968 <span class="comment"> */</span>
<a name="l00969"></a>00969 CREATE
<a name="l00970"></a>00970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
<a name="l00971"></a>00971 (
<a name="l00972"></a>00972 BIGINT[]
<a name="l00973"></a>00973 )
<a name="l00974"></a>00974 (
<a name="l00975"></a>00975 SFUNC=MADLIB_SCHEMA.__bigint_array_add,
<a name="l00976"></a>00976 m4_ifdef(`__GREENPLUM__&#39;, `prefunc=MADLIB_SCHEMA.__bigint_array_add,&#39;)
<a name="l00977"></a>00977 STYPE=BIGINT[]
<a name="l00978"></a>00978 );
<a name="l00979"></a>00979
<a name="l00980"></a>00980
<a name="l00981"></a>00981 <span class="comment">/*</span>
<a name="l00982"></a>00982 <span class="comment"> * @brief This function finds the best split and stores the result in output_table.</span>
<a name="l00983"></a>00983 <span class="comment"> *</span>
<a name="l00984"></a>00984 <span class="comment"> * @param table_name The name of the table containing the training</span>
<a name="l00985"></a>00985 <span class="comment"> * set.</span>
<a name="l00986"></a>00986 <span class="comment"> * @param confidence_level This parameter is used by the &#39;Error-Based Pruning&#39;.</span>
<a name="l00987"></a>00987 <span class="comment"> * Please refer to the paper for detailed definition.</span>
<a name="l00988"></a>00988 <span class="comment"> * The paper&#39;s name is &#39;Error-Based Pruning of Decision </span>
<a name="l00989"></a>00989 <span class="comment"> * Trees Grown on Very Large Data Sets Can Work!&#39;.</span>
<a name="l00990"></a>00990 <span class="comment"> * @param feature_table_name It is the name of one internal table, which contains</span>
<a name="l00991"></a>00991 <span class="comment"> * meta data for each feature.</span>
<a name="l00992"></a>00992 <span class="comment"> * @param split_criterion It defines the split criterion to be used.</span>
<a name="l00993"></a>00993 <span class="comment"> * (1- information gain. 2- gain ratio. 3- gini).</span>
<a name="l00994"></a>00994 <span class="comment"> * @param continue_grow It specifies whether we should still grow the tree</span>
<a name="l00995"></a>00995 <span class="comment"> * on the selected branch.</span>
<a name="l00996"></a>00996 <span class="comment"> * @param output_table It specifies the table used to store the chosen splits.</span>
<a name="l00997"></a>00997 <span class="comment"> * @param h2hmv_routine_id Specifies how to handle missing values. </span>
<a name="l00998"></a>00998 <span class="comment"> * 1 ignore, 2 explicit.</span>
<a name="l00999"></a>00999 <span class="comment"> * @param num_classes The number of classes; it is passed to __scv_aggr.</span>
<a name="l01000"></a>01000 <span class="comment"> */</span>
<a name="l01001"></a>01001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
<a name="l01002"></a>01002 (
<a name="l01003"></a>01003 table_name TEXT,
<a name="l01004"></a>01004 confidence_level FLOAT,
<a name="l01005"></a>01005 feature_table_name TEXT,
<a name="l01006"></a>01006 split_criterion INT,
<a name="l01007"></a>01007 continue_grow INT,
<a name="l01008"></a>01008 output_table TEXT,
<a name="l01009"></a>01009 h2hmv_routine_id INT,
<a name="l01010"></a>01010 num_classes INT
<a name="l01011"></a>01011 )
<a name="l01012"></a>01012 RETURNS VOID AS $$
<a name="l01013"></a>01013 DECLARE
<a name="l01014"></a>01014 total_size INT; -- NOTE(review): declared but never referenced in this function
<a name="l01015"></a>01015 curstmt TEXT := &#39;&#39;;
<a name="l01016"></a>01016 begin_func_exec TIMESTAMP; -- NOTE(review): assigned below but never read
<a name="l01017"></a>01017 select_stmt TEXT;
<a name="l01018"></a>01018 BEGIN
<a name="l01019"></a>01019 begin_func_exec = clock_timestamp();
<a name="l01020"></a>01020
<a name="l01021"></a>01021 IF (h2hmv_routine_id=1) THEN
<a name="l01022"></a>01022 -- For ignore, we need the true size of nodes to handle the missing values.
<a name="l01023"></a>01023 select_stmt =
<a name="l01024"></a>01024 &#39;SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
<a name="l01025"></a>01025 FROM
<a name="l01026"></a>01026 (
<a name="l01027"></a>01027 SELECT tid, nid, fid,
<a name="l01028"></a>01028 m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) as total
<a name="l01029"></a>01029 FROM training_instance_aux
<a name="l01030"></a>01030 GROUP BY tid, nid, fid
<a name="l01031"></a>01031 ) t1 INNER JOIN node_size_aux t2
<a name="l01032"></a>01032 ON t1.tid=t2.tid AND t1.nid=t2.nid&#39;;
<a name="l01033"></a>01033 ELSE
<a name="l01034"></a>01034 -- For explicit, the calculated node size from the aggregation is correct.
<a name="l01035"></a>01035 -- We set node_size to NULL, which denotes we can safely use the counted value.
<a name="l01036"></a>01036 select_stmt =
<a name="l01037"></a>01037 &#39;SELECT tid, nid, fid,
<a name="l01038"></a>01038 m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) as total,
<a name="l01039"></a>01039 NULL::BIGINT AS node_size
<a name="l01040"></a>01040 FROM training_instance_aux
<a name="l01041"></a>01041 GROUP BY tid, nid, fid&#39;;
<a name="l01042"></a>01042 END IF;
<a name="l01043"></a>01043
<a name="l01044"></a>01044 <span class="comment">/*</span>
<a name="l01045"></a>01045 <span class="comment"> * This table is used to store information for the calculated best split </span>
<a name="l01046"></a>01046 <span class="comment"> *</span>
<a name="l01047"></a>01047 <span class="comment"> * tid The ID of the tree.</span>
<a name="l01048"></a>01048 <span class="comment"> * node_id The ID of one node in the specified tree.</span>
<a name="l01049"></a>01049 <span class="comment"> * feature The ID of the selected feature.</span>
<a name="l01050"></a>01050 <span class="comment"> * probability The predicted probability of our chosen class.</span>
<a name="l01051"></a>01051 <span class="comment"> * max_class The ID of the class chosen by the algorithm.</span>
<a name="l01052"></a>01052 <span class="comment"> * max_scv The maximum split criterion value.</span>
<a name="l01053"></a>01053 <span class="comment"> * live 1- For the chosen split, we should split further.</span>
<a name="l01054"></a>01054 <span class="comment"> * 0- For the chosen split, we shouldn&#39;t split further.</span>
<a name="l01055"></a>01055 <span class="comment"> * ebp_coeff total error for error-based pruning.</span>
<a name="l01056"></a>01056 <span class="comment"> * is_cont whether the selected feature is continuous.</span>
<a name="l01057"></a>01057 <span class="comment"> * split_value If the selected feature is continuous, it specifies</span>
<a name="l01058"></a>01058 <span class="comment"> * the split value. Otherwise, it is of no use.</span>
<a name="l01059"></a>01059 <span class="comment"> * distinct_features The number of distinct values for the selected feature.</span>
<a name="l01060"></a>01060 <span class="comment"> * node_size The size of this tree node. </span>
<a name="l01061"></a>01061 <span class="comment"> *</span>
<a name="l01062"></a>01062 <span class="comment"> */</span>
<a name="l01063"></a>01063 EXECUTE &#39;DROP TABLE IF EXISTS &#39;||output_table;
<a name="l01064"></a>01064 EXECUTE &#39;CREATE TEMP TABLE &#39;||output_table||&#39;
<a name="l01065"></a>01065 (
<a name="l01066"></a>01066 tid INT,
<a name="l01067"></a>01067 node_id INT,
<a name="l01068"></a>01068 feature INT,
<a name="l01069"></a>01069 probability FLOAT,
<a name="l01070"></a>01070 max_class INTEGER,
<a name="l01071"></a>01071 max_scv FLOAT,
<a name="l01072"></a>01072 live INT,
<a name="l01073"></a>01073 ebp_coeff FLOAT,
<a name="l01074"></a>01074 is_cont BOOLEAN,
<a name="l01075"></a>01075 split_value FLOAT,
<a name="l01076"></a>01076 distinct_features INT,
<a name="l01077"></a>01077 node_size INT
<a name="l01078"></a>01078 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (node_id)&#39;);&#39;;
<a name="l01079"></a>01079
<a name="l01080"></a>01080
<a name="l01081"></a>01081 EXECUTE &#39;DROP TABLE IF EXISTS tmp_best_table&#39;; -- NOTE(review): tmp_best_table is not created or used anywhere else in this function
<a name="l01082"></a>01082
<a name="l01083"></a>01083 SELECT MADLIB_SCHEMA.__format
<a name="l01084"></a>01084 (
<a name="l01085"></a>01085 &#39;INSERT INTO %
<a name="l01086"></a>01086 SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
<a name="l01087"></a>01087 CASE WHEN (best_scv[1] &lt; 1e-9 OR
<a name="l01088"></a>01088 best_scv[4] &gt; 1-1e-9 OR % &lt;= 0 ) THEN
<a name="l01089"></a>01089 0
<a name="l01090"></a>01090 ELSE
<a name="l01091"></a>01091 1
<a name="l01092"></a>01092 END AS live,
<a name="l01093"></a>01093 MADLIB_SCHEMA.__ebp_calc_errors
<a name="l01094"></a>01094 (best_scv[5], best_scv[4], %) AS ebp_coeff,
<a name="l01095"></a>01095 o2.is_cont,
<a name="l01096"></a>01096 CASE WHEN( o2.is_cont ) THEN
<a name="l01097"></a>01097 best_scv[7]
<a name="l01098"></a>01098 ELSE
<a name="l01099"></a>01099 NULL
<a name="l01100"></a>01100 END AS split_value,
<a name="l01101"></a>01101 o2.num_dist_value, best_scv[5]
<a name="l01102"></a>01102 FROM
<a name="l01103"></a>01103 (
<a name="l01104"></a>01104 SELECT s1.tid, s1.nid,
<a name="l01105"></a>01105 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
<a name="l01106"></a>01106 coalesce(s1.split_value,0)) as best_scv
<a name="l01107"></a>01107 FROM (
<a name="l01108"></a>01108 SELECT t1.tid, t1.nid, t1.fid, split_value,
<a name="l01109"></a>01109 MADLIB_SCHEMA.__scv_aggr
<a name="l01110"></a>01110 (%, is_cont, %, le, total, t2.node_size) AS scv
<a name="l01111"></a>01111 FROM
<a name="l01112"></a>01112 (
<a name="l01113"></a>01113 SELECT tid, nid, fid, fval, is_cont,
<a name="l01114"></a>01114 CASE WHEN (is_cont) THEN
<a name="l01115"></a>01115 fval
<a name="l01116"></a>01116 ELSE
<a name="l01117"></a>01117 NULL::FLOAT8
<a name="l01118"></a>01118 END AS split_value,
<a name="l01119"></a>01119 CASE WHEN (is_cont) THEN
<a name="l01120"></a>01120 m4_ifdef(`__GREENPLUM__&#39;, `sum(count)&#39;, `MADLIB_SCHEMA.__bigint_array_sum(count)&#39;) OVER
<a name="l01121"></a>01121 (PARTITION BY tid, nid, fid ORDER BY fval
<a name="l01122"></a>01122 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
<a name="l01123"></a>01123 ELSE
<a name="l01124"></a>01124 count
<a name="l01125"></a>01125 END AS le
<a name="l01126"></a>01126 FROM training_instance_aux
<a name="l01127"></a>01127 ) t1,
<a name="l01128"></a>01128 (
<a name="l01129"></a>01129 %
<a name="l01130"></a>01130 ) t2
<a name="l01131"></a>01131 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
<a name="l01132"></a>01132 GROUP BY t1.tid, t1.nid, t1.fid, split_value
<a name="l01133"></a>01133 ) s1
<a name="l01134"></a>01134 GROUP BY s1.tid, s1.nid
<a name="l01135"></a>01135 ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.<span class="keywordtype">id</span>&#39;,
<a name="l01136"></a>01136 ARRAY[
<a name="l01137"></a>01137 output_table,
<a name="l01138"></a>01138 continue_grow::TEXT,
<a name="l01139"></a>01139 confidence_level::TEXT,
<a name="l01140"></a>01140 split_criterion::TEXT,
<a name="l01141"></a>01141 num_classes::TEXT,
<a name="l01142"></a>01142 select_stmt,
<a name="l01143"></a>01143 feature_table_name
<a name="l01144"></a>01144 ]
<a name="l01145"></a>01145 ) INTO curstmt;
<a name="l01146"></a>01146
<a name="l01147"></a>01147 EXECUTE curstmt;
<a name="l01148"></a>01148
<a name="l01149"></a>01149 RETURN;
<a name="l01150"></a>01150 END
<a name="l01151"></a>01151 $$ LANGUAGE PLPGSQL;
<a name="l01152"></a>01152
<a name="l01153"></a>01153
<a name="l01154"></a>01154 <span class="comment">/*</span>
<a name="l01155"></a>01155 <span class="comment"> * @brief For training one decision tree, we need some internal tables</span>
<a name="l01156"></a>01156 <span class="comment"> * to store intermediate results. This function creates those</span>
<a name="l01157"></a>01157 <span class="comment"> * tables. Moreover, this function also creates the tree table</span>
<a name="l01158"></a>01158 <span class="comment"> * specified by user.</span>
<a name="l01159"></a>01159 <span class="comment"> *</span>
<a name="l01160"></a>01160 <span class="comment"> * @param result_tree_table_name The name of the tree specified by user. </span>
<a name="l01161"></a>01161 <span class="comment"> * </span>
<a name="l01162"></a>01162 <span class="comment"> */</span>
<a name="l01163"></a>01163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
<a name="l01164"></a>01164 (
<a name="l01165"></a>01165 result_tree_table_name TEXT
<a name="l01166"></a>01166 )
<a name="l01167"></a>01167 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l01168"></a>01168 BEGIN
<a name="l01169"></a>01169 -- The table of node_size_aux records the size of each node. It is used
<a name="l01170"></a>01170 -- for missing value handling.
<a name="l01171"></a>01171 DROP TABLE IF EXISTS node_size_aux CASCADE;
<a name="l01172"></a>01172 CREATE TEMP TABLE node_size_aux
<a name="l01173"></a>01173 (
<a name="l01174"></a>01174 tid INT,
<a name="l01175"></a>01175 nid INT,
<a name="l01176"></a>01176 node_size INT
<a name="l01177"></a>01177 )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (tid,nid)&#39;);
<a name="l01178"></a>01178
<a name="l01179"></a>01179 -- The table below stores the decision tree information just constructed.
<a name="l01180"></a>01180 -- Columns:
<a name="l01181"></a>01181 -- <span class="keywordtype">id</span>: The ID of the node represented by this row. Tree
<a name="l01182"></a>01182 -- node IDs are unique across all trees. The IDs of
<a name="l01183"></a>01183 -- all children of a node are made to be contiguous.
<a name="l01184"></a>01184 -- tree_location: An array containing the encoded values of all the
<a name="l01185"></a>01185 -- features on the path from the root node to the
<a name="l01186"></a>01186 -- current node. For the root node, the location
<a name="l01187"></a>01187 -- value is {0}.
<a name="l01188"></a>01188 -- feature: The ID of the best split feature chosen <span class="keywordflow">for</span> the
<a name="l01189"></a>01189 -- node represented by <span class="keyword">this</span> row.
<a name="l01190"></a>01190 -- probability: If forced to make a call <span class="keywordflow">for</span> a dominant <span class="keyword">class </span>
<a name="l01191"></a>01191 -- at a given point this would be the confidence of the
<a name="l01192"></a>01192 -- call (this is only an estimated value).
<a name="l01193"></a>01193 -- ebp_coeff: The total errors used by error based pruning (ebp)
<a name="l01194"></a>01194 -- based on the specified confidence level. RF does
<a name="l01195"></a>01195 -- not do EBP therefore for RF nodes, this column always
<a name="l01196"></a>01196 -- contains 1.
<a name="l01197"></a>01197 -- max_class: If forced to make a call for a dominant class
<a name="l01198"></a>01198 -- at a given point this is the selected class.
<a name="l01199"></a>01199 -- scv: The splitting criteria value (scv) computed at this node.
<a name="l01200"></a>01200 -- live: Specifies whether the node should be further split
<a name="l01201"></a>01201 -- or not. A positive value indicates further split of
<a name="l01202"></a>01202 -- the node represented by this row is needed.
<a name="l01203"></a>01203 -- num_of_samples: The number of samples at this node.
<a name="l01204"></a>01204 -- parent_id: Id of the parent branch.
<a name="l01205"></a>01205 -- lmc_nid: Leftmost child (lmc) node id of the node represented
<a name="l01206"></a>01206 -- by the current row.
<a name="l01207"></a>01207 -- lmc_fval: The feature value which leads to the lmc node.
<a name="l01208"></a>01208 -- An example of getting all the child nodes&#39; ids
<a name="l01209"></a>01209 -- and condition values
<a name="l01210"></a>01210 -- 1. Get the right most node id
<a name="l01211"></a>01211 -- SELECT DISTINCT ON(parent_id) id FROM tree_table
<a name="l01212"></a>01212 -- WHERE parent_id = $pid ORDER BY parent_id, id desc
<a name="l01213"></a>01213 -- INTO max_child_nid;
<a name="l01214"></a>01214 -- 2. Get child nodes&#39; ids and condition values by a
<a name="l01215"></a>01215 -- while loop
<a name="l01216"></a>01216 -- node_count = 1;
<a name="l01217"></a>01217 -- WHILE (lmc_nid IS NOT NULL) AND
<a name="l01218"></a>01218 -- (0 &lt; node_count AND lmc_nid &lt;= max_child_nid) LOOP
<a name="l01219"></a>01219 -- ...
<a name="l01220"></a>01220 -- lmc_nid = lmc_nid + 1;
<a name="l01221"></a>01221 -- lmc_fval = lmc_fval + 1;
<a name="l01222"></a>01222 -- SELECT COUNT(id) FROM tree_table
<a name="l01223"></a>01223 -- WHERE id = $lmc_nid AND parent_id = $pid
<a name="l01224"></a>01224 -- INTO node_count;
<a name="l01225"></a>01225 -- END LOOP;
<a name="l01226"></a>01226 -- is_cont: It specifies whether the selected feature is a
<a name="l01227"></a>01227 -- continuous feature.
<a name="l01228"></a>01228 -- split_value: For continuous feature, it specifies the split value.
<a name="l01229"></a>01229 -- Otherwise, it is of no meaning and fixed to 0.
<a name="l01230"></a>01230 -- tid: The id of a tree that this node belongs to.
<a name="l01231"></a>01231 -- dp_ids: An array containing the IDs of the non-continuous
<a name="l01232"></a>01232 -- features chosen by all ancestors nodes (starting
<a name="l01233"></a>01233 -- from the root) for splitting.
<a name="l01234"></a>01234 --
<a name="l01235"></a>01235 -- The table below stores the final decision tree information.
<a name="l01236"></a>01236 -- It is the table specified by the user.
<a name="l01237"></a>01237 -- Please refer to the table above for detailed column definition.
<a name="l01238"></a>01238 EXECUTE &#39;DROP TABLE IF EXISTS &#39;||result_tree_table_name||&#39; CASCADE;&#39;;
<a name="l01239"></a>01239 EXECUTE &#39;CREATE TABLE &#39;||result_tree_table_name||&#39;
<a name="l01240"></a>01240 (
<a name="l01241"></a>01241 id INT,
<a name="l01242"></a>01242 tree_location INT[],
<a name="l01243"></a>01243 feature INT,
<a name="l01244"></a>01244 probability FLOAT,
<a name="l01245"></a>01245 ebp_coeff FLOAT,
<a name="l01246"></a>01246 max_class INTEGER,
<a name="l01247"></a>01247 scv FLOAT,
<a name="l01248"></a>01248 live INT,
<a name="l01249"></a>01249 num_of_samples INT,
<a name="l01250"></a>01250 parent_id INT,
<a name="l01251"></a>01251 lmc_nid INT,
<a name="l01252"></a>01252 lmc_fval INT,
<a name="l01253"></a>01253 is_cont BOOLEAN,
<a name="l01254"></a>01254 split_value FLOAT,
<a name="l01255"></a>01255 tid INT,
<a name="l01256"></a>01256 dp_ids INT[]
<a name="l01257"></a>01257 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (tid,id)&#39;);&#39;;
<a name="l01258"></a>01258
<a name="l01259"></a>01259 -- The following table stores the auxiliary information for updating the
<a name="l01260"></a>01260 -- association table, so that the updating operation only needs to
<a name="l01261"></a>01261 -- join the encoded table with association table once
<a name="l01262"></a>01262 EXECUTE &#39;DROP TABLE IF EXISTS assoc_aux CASCADE&#39;;
<a name="l01263"></a>01263 CREATE TEMP TABLE assoc_aux
<a name="l01264"></a>01264 (
<a name="l01265"></a>01265 nid INT,
<a name="l01266"></a>01266 fid INT,
<a name="l01267"></a>01267 lmc_id INT,
<a name="l01268"></a>01268 svalue FLOAT,
<a name="l01269"></a>01269 is_cont BOOL
<a name="l01270"></a>01270 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (nid)&#39;);
<a name="l01271"></a>01271
<a name="l01272"></a>01272 EXECUTE &#39;DROP TABLE IF EXISTS tr_assoc_ping CASCADE&#39;;
<a name="l01273"></a>01273 EXECUTE &#39;DROP TABLE IF EXISTS tr_assoc_pong CASCADE&#39;;
<a name="l01274"></a>01274 EXECUTE &#39;DROP TABLE IF EXISTS sf_assoc CASCADE&#39;;
<a name="l01275"></a>01275
<a name="l01276"></a>01276 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;) -- NOTE: until the quote characters are restored below, nested macro calls must be quoted with the triple angle brackets, not with backquote and apostrophe
<a name="l01277"></a>01277 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_GE_4_2_1__&lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01278"></a>01278 CREATE TEMP TABLE tr_assoc_ping
<a name="l01279"></a>01279 (
<a name="l01280"></a>01280 id BIGINT ENCODING (compresstype=RLE_TYPE),
<a name="l01281"></a>01281 nid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01282"></a>01282 tid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01283"></a>01283 weight INT ENCODING (compresstype=RLE_TYPE)
<a name="l01284"></a>01284 )
<a name="l01285"></a>01285 WITH(appendonly=true, orientation=column)
<a name="l01286"></a>01286 DISTRIBUTED BY(id);
<a name="l01287"></a>01287
<a name="l01288"></a>01288 CREATE TEMP TABLE tr_assoc_pong
<a name="l01289"></a>01289 (
<a name="l01290"></a>01290 id BIGINT ENCODING (compresstype=RLE_TYPE),
<a name="l01291"></a>01291 nid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01292"></a>01292 tid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01293"></a>01293 weight INT ENCODING (compresstype=RLE_TYPE)
<a name="l01294"></a>01294 )
<a name="l01295"></a>01295 WITH(appendonly=true, orientation=column)
<a name="l01296"></a>01296 DISTRIBUTED BY(id);
<a name="l01297"></a>01297
<a name="l01298"></a>01298 CREATE TEMP TABLE sf_assoc
<a name="l01299"></a>01299 (
<a name="l01300"></a>01300 nid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01301"></a>01301 fid INT ENCODING (compresstype=RLE_TYPE)
<a name="l01302"></a>01302 )
<a name="l01303"></a>01303 WITH(appendonly=true, orientation=column)
<a name="l01304"></a>01304 DISTRIBUTED BY(fid);
<a name="l01305"></a>01305 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01306"></a>01306 CREATE TEMP TABLE tr_assoc_ping
<a name="l01307"></a>01307 (
<a name="l01308"></a>01308 id BIGINT,
<a name="l01309"></a>01309 nid INT,
<a name="l01310"></a>01310 tid INT,
<a name="l01311"></a>01311 weight INT
<a name="l01312"></a>01312 )m4_ifdef(&gt;&gt;&gt;__GREENPLUM__&lt;&lt;&lt;, &gt;&gt;&gt;DISTRIBUTED BY (id)&lt;&lt;&lt;);
<a name="l01313"></a>01313 CREATE TEMP TABLE tr_assoc_pong
<a name="l01314"></a>01314 (
<a name="l01315"></a>01315 id BIGINT,
<a name="l01316"></a>01316 nid INT,
<a name="l01317"></a>01317 tid INT,
<a name="l01318"></a>01318 weight INT
<a name="l01319"></a>01319 )m4_ifdef(&gt;&gt;&gt;__GREENPLUM__&lt;&lt;&lt;, &gt;&gt;&gt;DISTRIBUTED BY (id)&lt;&lt;&lt;);
<a name="l01320"></a>01320 CREATE TEMP TABLE sf_assoc
<a name="l01321"></a>01321 (
<a name="l01322"></a>01322 nid INT,
<a name="l01323"></a>01323 fid INT
<a name="l01324"></a>01324 )m4_ifdef(&gt;&gt;&gt;__GREENPLUM__&lt;&lt;&lt;, &gt;&gt;&gt;DISTRIBUTED BY (fid)&lt;&lt;&lt;);
<a name="l01325"></a>01325 &lt;&lt;&lt;)
<a name="l01326"></a>01326 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01327"></a>01327 END
<a name="l01328"></a>01328 $$ LANGUAGE PLPGSQL;
<a name="l01329"></a>01329
<a name="l01330"></a>01330
<a name="l01331"></a>01331 <span class="comment">/*</span>
<a name="l01332"></a>01332 <span class="comment"> * @brief Prune the trained tree with &quot;Reduced Error Pruning&quot; algorithm.</span>
<a name="l01333"></a>01333 <span class="comment"> *</span>
<a name="l01334"></a>01334 <span class="comment"> * @param tree_table_name The name of the table containing the tree. </span>
<a name="l01335"></a>01335 <span class="comment"> * @param validation_table The name of the table containing validation set. </span>
<a name="l01336"></a>01336 <span class="comment"> * @param max_num_classes The count of different classes. </span>
<a name="l01337"></a>01337 <span class="comment"> * </span>
<a name="l01338"></a>01338 <span class="comment"> */</span>
<a name="l01339"></a>01339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
<a name="l01340"></a>01340 (
<a name="l01341"></a>01341 tree_table_name TEXT,
<a name="l01342"></a>01342 validation_table TEXT,
<a name="l01343"></a>01343 max_num_classes INT
<a name="l01344"></a>01344 )
<a name="l01345"></a>01345 RETURNS void AS $$
<a name="l01346"></a>01346 DECLARE
<a name="l01347"></a>01347 num_parent_ids INTEGER;
<a name="l01348"></a>01348 cf_table_name TEXT;
<a name="l01349"></a>01349 encoded_table_name TEXT;
<a name="l01350"></a>01350 metatable_name TEXT;
<a name="l01351"></a>01351 curstmt TEXT;
<a name="l01352"></a>01352 id_col_name TEXT;
<a name="l01353"></a>01353 class_col_name TEXT;
<a name="l01354"></a>01354 classify_result TEXT;
<a name="l01355"></a>01355 temp_text TEXT;
<a name="l01356"></a>01356 n INT;
<a name="l01357"></a>01357 table_names TEXT[];
<a name="l01358"></a>01358 BEGIN
<a name="l01359"></a>01359 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
<a name="l01360"></a>01360 id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
<a name="l01361"></a>01361 class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
<a name="l01362"></a>01362
<a name="l01363"></a>01363 -- the value of class column in validation table must in the KV table
<a name="l01364"></a>01364 SELECT MADLIB_SCHEMA.__format
<a name="l01365"></a>01365 (
<a name="l01366"></a>01366 &#39;SELECT COUNT(*)
<a name="l01367"></a>01367 FROM %
<a name="l01368"></a>01368 WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
<a name="l01369"></a>01369 (SELECT fval FROM % WHERE fval IS NOT NULL)&#39;,
<a name="l01370"></a>01370 ARRAY[
<a name="l01371"></a>01371 validation_table,
<a name="l01372"></a>01372 class_col_name,
<a name="l01373"></a>01373 MADLIB_SCHEMA.__get_classtable_name(metatable_name)
<a name="l01374"></a>01374 ]
<a name="l01375"></a>01375 )
<a name="l01376"></a>01376 INTO curstmt;
<a name="l01377"></a>01377
<a name="l01378"></a>01378 EXECUTE curstmt INTO n;
<a name="l01379"></a>01379
<a name="l01380"></a>01380 PERFORM MADLIB_SCHEMA.__assert
<a name="l01381"></a>01381 (
<a name="l01382"></a>01382 n = 0,
<a name="l01383"></a>01383 &#39;the value of class column in validation table must in
<a name="l01384"></a>01384 training table&#39;
<a name="l01385"></a>01385 );
<a name="l01386"></a>01386
<a name="l01387"></a>01387 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
<a name="l01388"></a>01388 (
<a name="l01389"></a>01389 validation_table,
<a name="l01390"></a>01390 tree_table_name,
<a name="l01391"></a>01391 0
<a name="l01392"></a>01392 );
<a name="l01393"></a>01393
<a name="l01394"></a>01394 encoded_table_name = table_names[1];
<a name="l01395"></a>01395 classify_result = table_names[2];
<a name="l01396"></a>01396 cf_table_name = classify_result;
<a name="l01397"></a>01397
<a name="l01398"></a>01398 -- after encoding in classification, class_col_name is fixed to class
<a name="l01399"></a>01399 class_col_name = &#39;class&#39;;
<a name="l01400"></a>01400
<a name="l01401"></a>01401 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01402"></a>01402 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_PRE_4_1__&lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01403"></a>01403 EXECUTE &#39;DROP TABLE IF EXISTS tree_rep_pong CASCADE&#39;;
<a name="l01404"></a>01404 EXECUTE &#39;CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM &#39; ||
<a name="l01405"></a>01405 classify_result ||
<a name="l01406"></a>01406 &#39; LIMIT 0 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;)&#39;;
<a name="l01407"></a>01407 &lt;&lt;&lt;)
<a name="l01408"></a>01408 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01409"></a>01409
<a name="l01410"></a>01410 LOOP
<a name="l01411"></a>01411 DROP TABLE IF EXISTS selected_parent_ids_rep;
<a name="l01412"></a>01412 CREATE TEMP TABLE selected_parent_ids_rep
<a name="l01413"></a>01413 (
<a name="l01414"></a>01414 parent_id BIGINT,
<a name="l01415"></a>01415 max_class INT
<a name="l01416"></a>01416 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (parent_id)&#39;);
<a name="l01417"></a>01417
<a name="l01418"></a>01418 SELECT MADLIB_SCHEMA.__format
<a name="l01419"></a>01419 (
<a name="l01420"></a>01420 &#39;INSERT INTO selected_parent_ids_rep
<a name="l01421"></a>01421 SELECT parent_id, t.g[1] as max_class
<a name="l01422"></a>01422 FROM
<a name="l01423"></a>01423 (
<a name="l01424"></a>01424 SELECT parent_id,
<a name="l01425"></a>01425 MADLIB_SCHEMA.__rep_aggr_class_count
<a name="l01426"></a>01426 (
<a name="l01427"></a>01427 c.class,
<a name="l01428"></a>01428 s.%,
<a name="l01429"></a>01429 %
<a name="l01430"></a>01430 ) AS g
<a name="l01431"></a>01431 FROM % c, % s
<a name="l01432"></a>01432 WHERE c.id=s.%
<a name="l01433"></a>01433 GROUP BY parent_id
<a name="l01434"></a>01434 ) t
<a name="l01435"></a>01435 WHERE t.g[2] &gt;= 0 AND
<a name="l01436"></a>01436 t.parent_id IN
<a name="l01437"></a>01437 (
<a name="l01438"></a>01438 Select parent_id FROM %
<a name="l01439"></a>01439 WHERE parent_id NOT IN
<a name="l01440"></a>01440 (
<a name="l01441"></a>01441 Select parent_id
<a name="l01442"></a>01442 FROM %
<a name="l01443"></a>01443 WHERE lmc_nid IS NOT NULL
<a name="l01444"></a>01444 ) and id &lt;&gt; 1
<a name="l01445"></a>01445 );&#39;,
<a name="l01446"></a>01446 ARRAY[
<a name="l01447"></a>01447 class_col_name,
<a name="l01448"></a>01448 MADLIB_SCHEMA.__to_char(max_num_classes),
<a name="l01449"></a>01449 classify_result,
<a name="l01450"></a>01450 encoded_table_name,
<a name="l01451"></a>01451 id_col_name,
<a name="l01452"></a>01452 tree_table_name,
<a name="l01453"></a>01453 tree_table_name
<a name="l01454"></a>01454 ]
<a name="l01455"></a>01455 )
<a name="l01456"></a>01456 INTO curstmt;
<a name="l01457"></a>01457
<a name="l01458"></a>01458 EXECUTE curstmt;
<a name="l01459"></a>01459
<a name="l01460"></a>01460 EXECUTE &#39;SELECT parent_id FROM selected_parent_ids_rep limit 1;&#39;
<a name="l01461"></a>01461 INTO num_parent_ids;
<a name="l01462"></a>01462 IF (num_parent_ids IS NULL) THEN
<a name="l01463"></a>01463 EXIT;
<a name="l01464"></a>01464 END IF;
<a name="l01465"></a>01465
<a name="l01466"></a>01466 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01467"></a>01467 m4_ifdef(`__GREENPLUM_PRE_4_1__&#39;, &gt;&gt;&gt;
<a name="l01468"></a>01468 -- for some databases, update operation can&#39;t distribute data across segments
<a name="l01469"></a>01469 -- we use two tables to update the data
<a name="l01470"></a>01470 IF (classify_result = &#39;tree_rep_pong&#39;) THEN
<a name="l01471"></a>01471 temp_text = cf_table_name;
<a name="l01472"></a>01472 ELSE
<a name="l01473"></a>01473 temp_text = &#39;tree_rep_pong&#39;;
<a name="l01474"></a>01474 END IF;
<a name="l01475"></a>01475
<a name="l01476"></a>01476 EXECUTE &#39;TRUNCATE &#39; || temp_text;
<a name="l01477"></a>01477 SELECT MADLIB_SCHEMA.__format
<a name="l01478"></a>01478 (
<a name="l01479"></a>01479 &#39;INSERT INTO %(id, class, parent_id, leaf_id)
<a name="l01480"></a>01480 SELECT m.id, t.max_class, t.parent_id, t.id
<a name="l01481"></a>01481 FROM % m, % t
<a name="l01482"></a>01482 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
<a name="l01483"></a>01483 m.parent_id = t.id&#39;,
<a name="l01484"></a>01484 ARRAY[
<a name="l01485"></a>01485 temp_text,
<a name="l01486"></a>01486 classify_result,
<a name="l01487"></a>01487 tree_table_name
<a name="l01488"></a>01488 ]
<a name="l01489"></a>01489 )
<a name="l01490"></a>01490 INTO curstmt;
<a name="l01491"></a>01491
<a name="l01492"></a>01492 EXECUTE curstmt;
<a name="l01493"></a>01493
<a name="l01494"></a>01494 classify_result = temp_text;
<a name="l01495"></a>01495 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01496"></a>01496 SELECT MADLIB_SCHEMA.__format
<a name="l01497"></a>01497 (
<a name="l01498"></a>01498 &#39;UPDATE % m set class = t.max_class,
<a name="l01499"></a>01499 parent_id = t.parent_id,leaf_id = t.id
<a name="l01500"></a>01500 FROM % t
<a name="l01501"></a>01501 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
<a name="l01502"></a>01502 m.parent_id=t.id&#39;,
<a name="l01503"></a>01503 classify_result,
<a name="l01504"></a>01504 tree_table_name
<a name="l01505"></a>01505 )
<a name="l01506"></a>01506 INTO curstmt;
<a name="l01507"></a>01507 EXECUTE curstmt;
<a name="l01508"></a>01508 &lt;&lt;&lt;)
<a name="l01509"></a>01509 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01510"></a>01510
<a name="l01511"></a>01511 SELECT MADLIB_SCHEMA.__format
<a name="l01512"></a>01512 (
<a name="l01513"></a>01513 &#39;DELETE FROM % WHERE parent_id IN
<a name="l01514"></a>01514 (SELECT parent_id FROM selected_parent_ids_rep)&#39;,
<a name="l01515"></a>01515 tree_table_name
<a name="l01516"></a>01516 )
<a name="l01517"></a>01517 INTO curstmt;
<a name="l01518"></a>01518
<a name="l01519"></a>01519 EXECUTE curstmt;
<a name="l01520"></a>01520
<a name="l01521"></a>01521 SELECT MADLIB_SCHEMA.__format
<a name="l01522"></a>01522 (
<a name="l01523"></a>01523 &#39;UPDATE % t1 SET lmc_nid = NULL,
<a name="l01524"></a>01524 lmc_fval = NULL, max_class = t2.max_class
<a name="l01525"></a>01525 FROM selected_parent_ids_rep t2
<a name="l01526"></a>01526 WHERE t1.id = t2.parent_id;&#39;,
<a name="l01527"></a>01527 tree_table_name
<a name="l01528"></a>01528 )
<a name="l01529"></a>01529 INTO curstmt;
<a name="l01530"></a>01530
<a name="l01531"></a>01531 EXECUTE curstmt;
<a name="l01532"></a>01532
<a name="l01533"></a>01533 END LOOP;
<a name="l01534"></a>01534
<a name="l01535"></a>01535 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE;&#39;;
<a name="l01536"></a>01536 END
<a name="l01537"></a>01537 $$ LANGUAGE PLPGSQL;
<a name="l01538"></a>01538
<a name="l01539"></a>01539
<a name="l01540"></a>01540 <span class="comment">/*</span>
<a name="l01541"></a>01541 <span class="comment"> * @brief Calculates the total errors used by Error Based Pruning (EBP).</span>
<a name="l01542"></a>01542 <span class="comment"> *</span>
<a name="l01543"></a>01543 <span class="comment"> * @param total The number of total samples represented by the node </span>
<a name="l01544"></a>01544 <span class="comment"> * being processed. </span>
<a name="l01545"></a>01545 <span class="comment"> * @param prob The probability to mis-classify samples represented by the </span>
<a name="l01546"></a>01546 <span class="comment"> * child nodes if they are pruned with EBP. </span>
<a name="l01547"></a>01547 <span class="comment"> * @param confidence_level A certainty factor to calculate the confidence limits</span>
<a name="l01548"></a>01548 <span class="comment"> * for the probability of error using the binomial theorem. </span>
<a name="l01549"></a>01549 <span class="comment"> * </span>
<a name="l01550"></a>01550 <span class="comment"> * @return The computed total error.</span>
<a name="l01551"></a>01551 <span class="comment"> *</span>
<a name="l01552"></a>01552 <span class="comment"> */</span>
<a name="l01553"></a>01553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
<a name="l01554"></a>01554 (
<a name="l01555"></a>01555 total FLOAT8,
<a name="l01556"></a>01556 prob FLOAT8,
<a name="l01557"></a>01557 confidence_level FLOAT8
<a name="l01558"></a>01558 ) RETURNS FLOAT8
<a name="l01559"></a>01559 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_ebp_calc_errors&#39;
<a name="l01560"></a>01560 LANGUAGE C STRICT IMMUTABLE;
<a name="l01561"></a>01561
<a name="l01562"></a>01562
<a name="l01563"></a>01563 <span class="comment">/*</span>
<a name="l01564"></a>01564 <span class="comment"> * @brief Prune the trained tree with &quot;Error-based Pruning&quot; algorithm.</span>
<a name="l01565"></a>01565 <span class="comment"> *</span>
<a name="l01566"></a>01566 <span class="comment"> * @param tree_table_name The name of the table containing the tree. </span>
<a name="l01567"></a>01567 <span class="comment"> * </span>
<a name="l01568"></a>01568 <span class="comment"> */</span>
<a name="l01569"></a>01569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
<a name="l01570"></a>01570 (
<a name="l01571"></a>01571 tree_table_name TEXT
<a name="l01572"></a>01572 )
<a name="l01573"></a>01573 RETURNS void AS $$
<a name="l01574"></a>01574 DECLARE
<a name="l01575"></a>01575 num_parent_ids INTEGER;
<a name="l01576"></a>01576 curstmt TEXT;
<a name="l01577"></a>01577 BEGIN
<a name="l01578"></a>01578 LOOP
<a name="l01579"></a>01579 DROP TABLE IF EXISTS selected_parent_ids_ebp;
<a name="l01580"></a>01580 CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
<a name="l01581"></a>01581 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY(parent_id)&#39;);
<a name="l01582"></a>01582
<a name="l01583"></a>01583 SELECT MADLIB_SCHEMA.__format
<a name="l01584"></a>01584 (
<a name="l01585"></a>01585 &#39;INSERT INTO selected_parent_ids_ebp
<a name="l01586"></a>01586 SELECT s.parent_id as parent_id
<a name="l01587"></a>01587 FROM
<a name="l01588"></a>01588 (
<a name="l01589"></a>01589 Select parent_id, sum(ebp_coeff) as ebp_coeff
<a name="l01590"></a>01590 FROM
<a name="l01591"></a>01591 (
<a name="l01592"></a>01592 Select parent_id, ebp_coeff
<a name="l01593"></a>01593 FROM %
<a name="l01594"></a>01594 WHERE parent_id NOT IN
<a name="l01595"></a>01595 (
<a name="l01596"></a>01596 Select parent_id FROM % WHERE lmc_nid IS NOT NULL
<a name="l01597"></a>01597 ) and id &lt;&gt; 1
<a name="l01598"></a>01598 ) m
<a name="l01599"></a>01599 GROUP BY m.parent_id
<a name="l01600"></a>01600 ) s
<a name="l01601"></a>01601 LEFT JOIN % p
<a name="l01602"></a>01602 ON p.id = s.parent_id
<a name="l01603"></a>01603 WHERE p.ebp_coeff &lt; s.ebp_coeff;&#39;,
<a name="l01604"></a>01604 tree_table_name,
<a name="l01605"></a>01605 tree_table_name,
<a name="l01606"></a>01606 tree_table_name
<a name="l01607"></a>01607 )
<a name="l01608"></a>01608 INTO curstmt;
<a name="l01609"></a>01609
<a name="l01610"></a>01610 EXECUTE curstmt;
<a name="l01611"></a>01611
<a name="l01612"></a>01612 EXECUTE &#39;SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;&#39;
<a name="l01613"></a>01613 INTO num_parent_ids;
<a name="l01614"></a>01614
<a name="l01615"></a>01615 IF (num_parent_ids IS NULL) THEN
<a name="l01616"></a>01616 EXIT;
<a name="l01617"></a>01617 END IF;
<a name="l01618"></a>01618
<a name="l01619"></a>01619 SELECT MADLIB_SCHEMA.__format
<a name="l01620"></a>01620 (
<a name="l01621"></a>01621 &#39;DELETE FROM %
<a name="l01622"></a>01622 WHERE parent_id IN
<a name="l01623"></a>01623 (SELECT parent_id FROM selected_parent_ids_ebp)&#39;,
<a name="l01624"></a>01624 tree_table_name
<a name="l01625"></a>01625 )
<a name="l01626"></a>01626 INTO curstmt;
<a name="l01627"></a>01627
<a name="l01628"></a>01628 EXECUTE curstmt;
<a name="l01629"></a>01629
<a name="l01630"></a>01630 SELECT MADLIB_SCHEMA.__format
<a name="l01631"></a>01631 (
<a name="l01632"></a>01632 &#39;UPDATE %
<a name="l01633"></a>01633 SET lmc_nid = NULL, lmc_fval = NULL
<a name="l01634"></a>01634 WHERE id IN
<a name="l01635"></a>01635 (SELECT parent_id FROM selected_parent_ids_ebp)&#39;,
<a name="l01636"></a>01636 tree_table_name
<a name="l01637"></a>01637 )
<a name="l01638"></a>01638 INTO curstmt;
<a name="l01639"></a>01639
<a name="l01640"></a>01640 EXECUTE curstmt;
<a name="l01641"></a>01641
<a name="l01642"></a>01642 END LOOP;
<a name="l01643"></a>01643 END
<a name="l01644"></a>01644 $$ LANGUAGE PLPGSQL;
<a name="l01645"></a>01645
<a name="l01646"></a>01646
<a name="l01647"></a>01647 <span class="comment">/*</span>
<a name="l01648"></a>01648 <span class="comment"> * @brief Generate the final trained tree.</span>
<a name="l01649"></a>01649 <span class="comment"> *</span>
<a name="l01650"></a>01650 <span class="comment"> * @param result_tree_table_name The name of the table containing the tree.</span>
<a name="l01651"></a>01651 <span class="comment"> * </span>
<a name="l01652"></a>01652 <span class="comment"> */</span>
<a name="l01653"></a>01653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
<a name="l01654"></a>01654 (
<a name="l01655"></a>01655 result_tree_table_name TEXT
<a name="l01656"></a>01656 )
<a name="l01657"></a>01657 RETURNS void AS $$
<a name="l01658"></a>01658 DECLARE
<a name="l01659"></a>01659 tree_size INTEGER;
<a name="l01660"></a>01660 curstmt TEXT;
<a name="l01661"></a>01661 num_redundant_nodes INTEGER;
<a name="l01662"></a>01662 BEGIN
<a name="l01663"></a>01663
<a name="l01664"></a>01664 EXECUTE &#39; DELETE FROM &#39; || result_tree_table_name ||
<a name="l01665"></a>01665 &#39; WHERE COALESCE(num_of_samples,0) = 0&#39;;
<a name="l01666"></a>01666
<a name="l01667"></a>01667 -- for each node, find the left most child node id and the feature value,
<a name="l01668"></a>01668 -- and update the node&#39;s lmc_nid and lmc_fval column
<a name="l01669"></a>01669 SELECT MADLIB_SCHEMA.__format
<a name="l01670"></a>01670 (
<a name="l01671"></a>01671 &#39;UPDATE % k
<a name="l01672"></a>01672 SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
<a name="l01673"></a>01673 FROM
<a name="l01674"></a>01674 (
<a name="l01675"></a>01675 SELECT parent_id,
<a name="l01676"></a>01676 min(id) as lmc_nid,
<a name="l01677"></a>01677 min(tree_location[array_upper(tree_location,1)])
<a name="l01678"></a>01678 as lmc_fval
<a name="l01679"></a>01679 FROM %
<a name="l01680"></a>01680 GROUP BY parent_id
<a name="l01681"></a>01681 ) g
<a name="l01682"></a>01682 WHERE k.id = g.parent_id&#39;,
<a name="l01683"></a>01683 ARRAY[
<a name="l01684"></a>01684 result_tree_table_name,
<a name="l01685"></a>01685 result_tree_table_name
<a name="l01686"></a>01686 ]
<a name="l01687"></a>01687 )
<a name="l01688"></a>01688 INTO curstmt;
<a name="l01689"></a>01689 EXECUTE curstmt;
<a name="l01690"></a>01690
<a name="l01691"></a>01691 <span class="comment">/*</span>
<a name="l01692"></a>01692 <span class="comment"> * For a certain node, if all of its children are leaf nodes and have the </span>
<a name="l01693"></a>01693 <span class="comment"> * same class label, we can safely remove its children. After removal, we</span>
<a name="l01694"></a>01694 <span class="comment"> * should apply the same operation to the new leaf nodes until no nodes </span>
<a name="l01695"></a>01695 <span class="comment"> * meet this criterion.</span>
<a name="l01696"></a>01696 <span class="comment"> */</span>
<a name="l01697"></a>01697 LOOP
<a name="l01698"></a>01698 EXECUTE &#39;DROP TABLE IF EXISTS trim_tree_aux_table CASCADE&#39;;
<a name="l01699"></a>01699 -- Find nodes whose children should be removed.
<a name="l01700"></a>01700 curstmt = MADLIB_SCHEMA.__format
<a name="l01701"></a>01701 (
<a name="l01702"></a>01702 &#39;CREATE TEMP TABLE trim_tree_aux_table AS
<a name="l01703"></a>01703 SELECT parent_id FROM
<a name="l01704"></a>01704 (
<a name="l01705"></a>01705 SELECT parent_id, count(distinct max_class) as class_count
<a name="l01706"></a>01706 FROM %
<a name="l01707"></a>01707 WHERE parent_id IN
<a name="l01708"></a>01708 (
<a name="l01709"></a>01709 SELECT parent_id FROM %
<a name="l01710"></a>01710 WHERE parent_id NOT IN
<a name="l01711"></a>01711 (
<a name="l01712"></a>01712 SELECT parent_id
<a name="l01713"></a>01713 FROM %
<a name="l01714"></a>01714 WHERE lmc_nid IS NOT NULL
<a name="l01715"></a>01715 ) and parent_id &lt;&gt; 0
<a name="l01716"></a>01716 )
<a name="l01717"></a>01717 GROUP BY parent_id
<a name="l01718"></a>01718 ) l
<a name="l01719"></a>01719 where l.class_count=1
<a name="l01720"></a>01720 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (parent_id)&#39;)&#39;,
<a name="l01721"></a>01721 ARRAY[
<a name="l01722"></a>01722 result_tree_table_name,
<a name="l01723"></a>01723 result_tree_table_name,
<a name="l01724"></a>01724 result_tree_table_name
<a name="l01725"></a>01725 ]
<a name="l01726"></a>01726 );
<a name="l01727"></a>01727 EXECUTE curstmt;
<a name="l01728"></a>01728
<a name="l01729"></a>01729 EXECUTE &#39;SELECT count(*) FROM trim_tree_aux_table&#39;
<a name="l01730"></a>01730 INTO num_redundant_nodes;
<a name="l01731"></a>01731
<a name="l01732"></a>01732 IF (num_redundant_nodes &lt;= 0) THEN
<a name="l01733"></a>01733 EXIT;
<a name="l01734"></a>01734 END IF;
<a name="l01735"></a>01735
<a name="l01736"></a>01736 -- Delete the found redundant nodes.
<a name="l01737"></a>01737 curstmt = MADLIB_SCHEMA.__format
<a name="l01738"></a>01738 (
<a name="l01739"></a>01739 &#39;
<a name="l01740"></a>01740 DELETE FROM % t
<a name="l01741"></a>01741 WHERE t.parent_id IN
<a name="l01742"></a>01742 (SELECT parent_id FROM trim_tree_aux_table)&#39;,
<a name="l01743"></a>01743 ARRAY[
<a name="l01744"></a>01744 result_tree_table_name
<a name="l01745"></a>01745 ]
<a name="l01746"></a>01746 );
<a name="l01747"></a>01747 EXECUTE curstmt;
<a name="l01748"></a>01748
<a name="l01749"></a>01749 -- Set the nodes, whose children are removed, to be leaf nodes.
<a name="l01750"></a>01750 curstmt = MADLIB_SCHEMA.__format
<a name="l01751"></a>01751 (
<a name="l01752"></a>01752 &#39;UPDATE % k
<a name="l01753"></a>01753 SET lmc_nid = NULL, lmc_fval = NULL
<a name="l01754"></a>01754 FROM
<a name="l01755"></a>01755 (
<a name="l01756"></a>01756 SELECT parent_id FROM trim_tree_aux_table
<a name="l01757"></a>01757 ) g
<a name="l01758"></a>01758 WHERE k.id = g.parent_id&#39;,
<a name="l01759"></a>01759 ARRAY[
<a name="l01760"></a>01760 result_tree_table_name
<a name="l01761"></a>01761 ]
<a name="l01762"></a>01762 );
<a name="l01763"></a>01763 EXECUTE curstmt;
<a name="l01764"></a>01764 END LOOP;
<a name="l01765"></a>01765 END
<a name="l01766"></a>01766 $$ LANGUAGE PLPGSQL;
<a name="l01767"></a>01767
<a name="l01768"></a>01768
<a name="l01769"></a>01769 <span class="comment">/*</span>
<a name="l01770"></a>01770 <span class="comment"> * The UDT for the training result.</span>
<a name="l01771"></a>01771 <span class="comment"> *</span>
<a name="l01772"></a>01772 <span class="comment"> * num_of_samples The number of records that exist in the </span>
<a name="l01773"></a>01773 <span class="comment"> * training set. </span>
<a name="l01774"></a>01774 <span class="comment"> * features_per_node The number of features chosen for each node.</span>
<a name="l01775"></a>01775 <span class="comment"> * num_tree_nodes The number of tree nodes.</span>
<a name="l01776"></a>01776 <span class="comment"> * max_tree_depth The max tree depth.</span>
<a name="l01777"></a>01777 <span class="comment"> * calc_acc_time Total time of calculating acc.</span>
<a name="l01778"></a>01778 <span class="comment"> * calc_pre_time Time of preprocessing when calculating acc.</span>
<a name="l01779"></a>01779 <span class="comment"> * update_time Total time of the update operations after finding</span>
<a name="l01780"></a>01780 <span class="comment"> * the best splits. </span>
<a name="l01781"></a>01781 <span class="comment"> * update_best Time of updating the best splits&#39; information.</span>
<a name="l01782"></a>01782 <span class="comment"> * update_child Time of generating the child nodes.</span>
<a name="l01783"></a>01783 <span class="comment"> * update_nid Time of updating the assigned node IDs.</span>
<a name="l01784"></a>01784 <span class="comment"> * scv_acs_time Time of calculating the best splits. </span>
<a name="l01785"></a>01785 <span class="comment"> * prune_time Time of tree pruning.</span>
<a name="l01786"></a>01786 <span class="comment"> *</span>
<a name="l01787"></a>01787 <span class="comment"> */</span>
<a name="l01788"></a>01788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
<a name="l01789"></a>01789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
<a name="l01790"></a>01790 (
<a name="l01791"></a>01791 num_of_samples BIGINT,
<a name="l01792"></a>01792 features_per_node INT,
<a name="l01793"></a>01793 num_tree_nodes INT,
<a name="l01794"></a>01794 max_tree_depth INT,
<a name="l01795"></a>01795 calc_acc_time INTERVAL,
<a name="l01796"></a>01796 calc_pre_time INTERVAL,
<a name="l01797"></a>01797 update_time INTERVAL,
<a name="l01798"></a>01798 update_best INTERVAL,
<a name="l01799"></a>01799 update_child INTERVAL,
<a name="l01800"></a>01800 update_nid INTERVAL,
<a name="l01801"></a>01801 scv_acs_time INTERVAL,
<a name="l01802"></a>01802 prune_time INTERVAL
<a name="l01803"></a>01803 );
<a name="l01804"></a>01804
<a name="l01805"></a>01805
<a name="l01806"></a>01806 <span class="comment">/*</span>
<a name="l01807"></a>01807 <span class="comment"> * @brief The function samples a set of integer values between low and high.</span>
<a name="l01808"></a>01808 <span class="comment"> *</span>
<a name="l01809"></a>01809 <span class="comment"> * @param num_of_samples The number of records to be sampled.</span>
<a name="l01810"></a>01810 <span class="comment"> * @param low The low limit of sampled values.</span>
<a name="l01811"></a>01811 <span class="comment"> * @param high The high limit of sampled values.</span>
<a name="l01812"></a>01812 <span class="comment"> *</span>
<a name="l01813"></a>01813 <span class="comment"> * @return A set of integer values sampled randomly between [low, high].</span>
<a name="l01814"></a>01814 <span class="comment"> *</span>
<a name="l01815"></a>01815 <span class="comment"> */</span>
<a name="l01816"></a>01816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
<a name="l01817"></a>01817 (
<a name="l01818"></a>01818 BIGINT,
<a name="l01819"></a>01819 BIGINT,
<a name="l01820"></a>01820 BIGINT
<a name="l01821"></a>01821 )CASCADE;
<a name="l01822"></a>01822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
<a name="l01823"></a>01823 (
<a name="l01824"></a>01824 num_of_samples BIGINT,
<a name="l01825"></a>01825 low BIGINT,
<a name="l01826"></a>01826 high BIGINT
<a name="l01827"></a>01827 )
<a name="l01828"></a>01828 RETURNS SETOF BIGINT
<a name="l01829"></a>01829 AS &#39;MODULE_PATHNAME&#39;, &#39;dt_sample_within_range&#39;
<a name="l01830"></a>01830 LANGUAGE C STRICT VOLATILE;
<a name="l01831"></a>01831
<a name="l01832"></a>01832
<a name="l01833"></a>01833 <span class="comment">/*</span>
<a name="l01834"></a>01834 <span class="comment"> * @brief The function samples with replacement from the source table and stores</span>
<a name="l01835"></a>01835 <span class="comment"> * the results in the target table.</span>
<a name="l01836"></a>01836 <span class="comment"> * </span>
<a name="l01837"></a>01837 <span class="comment"> * In this function, we first calculate how many samples should be</span>
<a name="l01838"></a>01838 <span class="comment"> * generated in each segment. Then, we let those segments sample with</span>
<a name="l01839"></a>01839 <span class="comment"> * replacement between the maximum ID and minimum ID of the source table </span>
<a name="l01840"></a>01840 <span class="comment"> * in parallel and assign samples to different trees. </span>
<a name="l01841"></a>01841 <span class="comment"> *</span>
<a name="l01842"></a>01842 <span class="comment"> * If there are gaps in the ID column of the source table, we sample</span>
<a name="l01843"></a>01843 <span class="comment"> * extra records in proportion to the number of gaps. Finally, we remove</span>
<a name="l01844"></a>01844 <span class="comment"> * these invalid samples with an inner join operation with the source</span>
<a name="l01845"></a>01845 <span class="comment"> * table. Since we target big data, this strategy works quite well.</span>
<a name="l01846"></a>01846 <span class="comment"> *</span>
<a name="l01847"></a>01847 <span class="comment"> * @param num_of_tree The number of trees to be trained.</span>
<a name="l01848"></a>01848 <span class="comment"> * @param size_per_tree The number of records to be sampled for each tree.</span>
<a name="l01849"></a>01849 <span class="comment"> * @param src_table The name of the table to be sampled from.</span>
<a name="l01850"></a>01850 <span class="comment"> * @param target_table The name of the table used to store the results.</span>
<a name="l01851"></a>01851 <span class="comment"> *</span>
<a name="l01852"></a>01852 <span class="comment"> */</span>
<a name="l01853"></a>01853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
<a name="l01854"></a>01854 (
<a name="l01855"></a>01855 num_of_tree INT,
<a name="l01856"></a>01856 size_per_tree INT,
<a name="l01857"></a>01857 src_table TEXT,
<a name="l01858"></a>01858 target_table TEXT
<a name="l01859"></a>01859 )
<a name="l01860"></a>01860 RETURNS VOID AS $$
<a name="l01861"></a>01861 DECLARE
<a name="l01862"></a>01862 segment_num INT;
<a name="l01863"></a>01863 sample_per_seg INT;
<a name="l01864"></a>01864 sample_ratio FLOAT8;
<a name="l01865"></a>01865 record_num FLOAT8;
<a name="l01866"></a>01866 min_id INT;
<a name="l01867"></a>01867 max_id INT;
<a name="l01868"></a>01868 range FLOAT8;
<a name="l01869"></a>01869 stmt TEXT;
<a name="l01870"></a>01870 BEGIN
<a name="l01871"></a>01871
<a name="l01872"></a>01872 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01873"></a>01873 m4_ifdef(&gt;&gt;&gt;__GREENPLUM__&lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01874"></a>01874 -- get the segment number
<a name="l01875"></a>01875 SELECT COUNT(distinct content) FROM gp_segment_configuration
<a name="l01876"></a>01876 WHERE content&lt;&gt;-1 INTO segment_num;
<a name="l01877"></a>01877 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01878"></a>01878 -- fix the segment number to 1 for PG
<a name="l01879"></a>01879 segment_num = 1;
<a name="l01880"></a>01880 &lt;&lt;&lt;)
<a name="l01881"></a>01881 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01882"></a>01882
<a name="l01883"></a>01883
<a name="l01884"></a>01884 DROP TABLE IF EXISTS auxiliary_segment_table;
<a name="l01885"></a>01885 CREATE TEMP TABLE auxiliary_segment_table
<a name="l01886"></a>01886 (
<a name="l01887"></a>01887 segment_id INT
<a name="l01888"></a>01888 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY(segment_id)&#39;);
<a name="l01889"></a>01889
<a name="l01890"></a>01890 -- Insert segment_num of records distributed by segment id
<a name="l01891"></a>01891 EXECUTE &#39;INSERT INTO auxiliary_segment_table
<a name="l01892"></a>01892 SELECT generate_series(1,&#39;||segment_num||&#39;);&#39;;
<a name="l01893"></a>01893
<a name="l01894"></a>01894 EXECUTE &#39;SELECT max(id),min(id), count(id) as record_num
<a name="l01895"></a>01895 FROM &#39;||src_table||&#39;;&#39; INTO max_id,min_id,record_num;
<a name="l01896"></a>01896 range=max_id-min_id+1;
<a name="l01897"></a>01897
<a name="l01898"></a>01898 -- compute the sample ratio
<a name="l01899"></a>01899 sample_ratio= range/record_num;
<a name="l01900"></a>01900
<a name="l01901"></a>01901 -- compute how many records should be sampled by each segment
<a name="l01902"></a>01902 sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
<a name="l01903"></a>01903
<a name="l01904"></a>01904 -- add the weight field
<a name="l01905"></a>01905
<a name="l01906"></a>01906 IF (range &gt; record_num) THEN
<a name="l01907"></a>01907 -- remove those invalid samples with join operation
<a name="l01908"></a>01908 stmt = MADLIB_SCHEMA.__format
<a name="l01909"></a>01909 (
<a name="l01910"></a>01910 &#39;INSERT INTO %(id, tid, nid, weight)
<a name="l01911"></a>01911 SELECT record_id,
<a name="l01912"></a>01912 tid AS tid,
<a name="l01913"></a>01913 tid AS nid,
<a name="l01914"></a>01914 count(*) AS weight
<a name="l01915"></a>01915 FROM
<a name="l01916"></a>01916 (
<a name="l01917"></a>01917 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
<a name="l01918"></a>01918 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
<a name="l01919"></a>01919 FROM auxiliary_segment_table
<a name="l01920"></a>01920 ) t,
<a name="l01921"></a>01921 % k
<a name="l01922"></a>01922 WHERE t.record_id=k.id
<a name="l01923"></a>01923 GROUP BY record_id, tid, nid&#39;,
<a name="l01924"></a>01924 ARRAY[
<a name="l01925"></a>01925 target_table,
<a name="l01926"></a>01926 sample_per_seg::TEXT,
<a name="l01927"></a>01927 min_id::TEXT,
<a name="l01928"></a>01928 max_id::TEXT,
<a name="l01929"></a>01929 sample_per_seg::TEXT,
<a name="l01930"></a>01930 num_of_tree::TEXT,
<a name="l01931"></a>01931 src_table
<a name="l01932"></a>01932 ]
<a name="l01933"></a>01933 );
<a name="l01934"></a>01934 ELSE
<a name="l01935"></a>01935 stmt = MADLIB_SCHEMA.__format
<a name="l01936"></a>01936 (
<a name="l01937"></a>01937 &#39;INSERT INTO %(id, tid, nid, weight)
<a name="l01938"></a>01938 SELECT record_id,
<a name="l01939"></a>01939 tid AS tid,
<a name="l01940"></a>01940 tid AS nid,
<a name="l01941"></a>01941 count(*) AS weight
<a name="l01942"></a>01942 FROM
<a name="l01943"></a>01943 (
<a name="l01944"></a>01944 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
<a name="l01945"></a>01945 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
<a name="l01946"></a>01946 FROM auxiliary_segment_table
<a name="l01947"></a>01947 ) t
<a name="l01948"></a>01948 GROUP BY record_id, tid, nid&#39;,
<a name="l01949"></a>01949 ARRAY[
<a name="l01950"></a>01950 target_table,
<a name="l01951"></a>01951 sample_per_seg::TEXT,
<a name="l01952"></a>01952 min_id::TEXT,
<a name="l01953"></a>01953 max_id::TEXT,
<a name="l01954"></a>01954 sample_per_seg::TEXT,
<a name="l01955"></a>01955 num_of_tree::TEXT
<a name="l01956"></a>01956 ]
<a name="l01957"></a>01957 );
<a name="l01958"></a>01958 END IF;
<a name="l01959"></a>01959
<a name="l01960"></a>01960 EXECUTE stmt;
<a name="l01961"></a>01961 END
<a name="l01962"></a>01962 $$ LANGUAGE PLPGSQL VOLATILE;
<a name="l01963"></a>01963
<a name="l01964"></a>01964
<a name="l01965"></a>01965 <span class="comment">/*</span>
<a name="l01966"></a>01966 <span class="comment"> * @brief This function trains a decision tree or random forest.</span>
<a name="l01967"></a>01967 <span class="comment"> *</span>
<a name="l01968"></a>01968 <span class="comment"> * @param split_criterion This parameter specifies which split criterion </span>
<a name="l01969"></a>01969 <span class="comment"> * should be used for tree construction and </span>
<a name="l01970"></a>01970 <span class="comment"> * pruning. The valid values are infogain, </span>
<a name="l01971"></a>01971 <span class="comment"> * gainratio, and gini.</span>
<a name="l01972"></a>01972 <span class="comment"> * @param num_trees Total number of trees to be trained. </span>
<a name="l01973"></a>01973 <span class="comment"> * @param features_per_node Total number of features used to compute split </span>
<a name="l01974"></a>01974 <span class="comment"> * gain for each node. </span>
<a name="l01975"></a>01975 <span class="comment"> * @param training_table_name The name of the table/view with the source data. </span>
<a name="l01976"></a>01976 <span class="comment"> * @param training_table_meta The name of the table with the meta data. </span>
<a name="l01977"></a>01977 <span class="comment"> * @param result_tree_table_name The name of the table where the resulting </span>
<a name="l01978"></a>01978 <span class="comment"> * DT/RF will be stored. </span>
<a name="l01979"></a>01979 <span class="comment"> * @param validation_table_name The validation table used for pruning tree. </span>
<a name="l01980"></a>01980 <span class="comment"> * @param id_col_name The name of the column containing id of each point. </span>
<a name="l01981"></a>01981 <span class="comment"> * @param class_col_name The name of the column containing correct class </span>
<a name="l01982"></a>01982 <span class="comment"> * of each point. </span>
<a name="l01983"></a>01983 <span class="comment"> * @param confidence_level A statistical confidence interval of the </span>
<a name="l01984"></a>01984 <span class="comment"> * resubstitution error. </span>
<a name="l01985"></a>01985 <span class="comment"> * @param max_tree_depth Maximum decision tree depth. </span>
<a name="l01986"></a>01986 <span class="comment"> * @param node_prune_threshold Specifies the minimum number of samples required </span>
<a name="l01987"></a>01987 <span class="comment"> * in a child node. </span>
<a name="l01988"></a>01988 <span class="comment"> * @param node_split_threshold Specifies the minimum number of samples required </span>
<a name="l01989"></a>01989 <span class="comment"> * in a node in order for a further split </span>
<a name="l01990"></a>01990 <span class="comment"> * to be possible. </span>
<a name="l01991"></a>01991 <span class="comment"> * @param sampling_needed Whether to enable the sampling functionality. </span>
<a name="l01992"></a>01992 <span class="comment"> * @param h2hmv_routine_id Specifies how to handle missing values. </span>
<a name="l01993"></a>01993 <span class="comment"> * 1 ignore, 2 explicit.</span>
<a name="l01994"></a>01994 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01995"></a>01995 <span class="comment"> * </span>
<a name="l01996"></a>01996 <span class="comment"> * @return The record including training related information.</span>
<a name="l01997"></a>01997 <span class="comment"> * For details, please refer to the UDT: MADLIB_SCHEMA.__train_result.</span>
<a name="l01998"></a>01998 <span class="comment"> *</span>
<a name="l01999"></a>01999 <span class="comment"> */</span>
<a name="l02000"></a>02000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
<a name="l02001"></a>02001 (
<a name="l02002"></a>02002 split_criterion TEXT,
<a name="l02003"></a>02003 num_trees INT,
<a name="l02004"></a>02004 features_per_node INT,
<a name="l02005"></a>02005 training_table_name TEXT,
<a name="l02006"></a>02006 training_table_meta TEXT,
<a name="l02007"></a>02007 result_tree_table_name TEXT,
<a name="l02008"></a>02008 validation_table_name TEXT,
<a name="l02009"></a>02009 id_col_name TEXT,
<a name="l02010"></a>02010 class_col_name TEXT,
<a name="l02011"></a>02011 confidence_level FLOAT,
<a name="l02012"></a>02012 max_tree_depth INT,
<a name="l02013"></a>02013 sampling_percentage FLOAT,
<a name="l02014"></a>02014 node_prune_threshold FLOAT,
<a name="l02015"></a>02015 node_split_threshold FLOAT,
<a name="l02016"></a>02016 sampling_needed BOOLEAN,
<a name="l02017"></a>02017 h2hmv_routine_id INT,
<a name="l02018"></a>02018 verbosity INT
<a name="l02019"></a>02019 )
<a name="l02020"></a>02020 RETURNS MADLIB_SCHEMA.__train_result AS $$
<a name="l02021"></a>02021 DECLARE
<a name="l02022"></a>02022 num_live_nodes INT;
<a name="l02023"></a>02023 max_nid INT;
<a name="l02024"></a>02024 location INT[];
<a name="l02025"></a>02025 temp_location INT[];
<a name="l02026"></a>02026 num_classes INT;
<a name="l02027"></a>02027 answer record;
<a name="l02028"></a>02028 location_size INT;
<a name="l02029"></a>02029 begin_func_exec TIMESTAMP;
<a name="l02030"></a>02030 begin_find_best TIMESTAMP;
<a name="l02031"></a>02031 scv_acs_time INTERVAL;
<a name="l02032"></a>02032 begin_data_transfer TIMESTAMP;
<a name="l02033"></a>02033 begin_update_best TIMESTAMP;
<a name="l02034"></a>02034 begin_update_child TIMESTAMP;
<a name="l02035"></a>02035 begin_update_nid TIMESTAMP;
<a name="l02036"></a>02036 calc_update_best INTERVAL;
<a name="l02037"></a>02037 calc_update_child INTERVAL;
<a name="l02038"></a>02038 calc_update_nid INTERVAL;
<a name="l02039"></a>02039 ins_upd_time INTERVAL;
<a name="l02040"></a>02040 begin_olap_acs TIMESTAMP;
<a name="l02041"></a>02041 calc_acc_time INTERVAL;
<a name="l02042"></a>02042 calc_pre_time INTERVAL;
<a name="l02043"></a>02043 calc_olap_time INTERVAL;
<a name="l02044"></a>02044 begin_bld_assoc TIMESTAMP;
<a name="l02045"></a>02045 bld_assoc_time INTERVAL;
<a name="l02046"></a>02046 begin_prune TIMESTAMP;
<a name="l02047"></a>02047 prune_time INTERVAL;
<a name="l02048"></a>02048 total_size FLOAT;
<a name="l02049"></a>02049 sc_code INT := 1;
<a name="l02050"></a>02050 curstmt TEXT := &#39;&#39;;
<a name="l02051"></a>02051 grow_tree INT := max_tree_depth;
<a name="l02052"></a>02052 ret MADLIB_SCHEMA.__train_result;
<a name="l02053"></a>02053 curr_level INT := 1;
<a name="l02054"></a>02054 dp_ids INT[];
<a name="l02055"></a>02055 dp_ids_text TEXT;
<a name="l02056"></a>02056 instance_time MADLIB_SCHEMA.__gen_acc_time;
<a name="l02057"></a>02057 tr_table_index INT := 1;
<a name="l02058"></a>02058 tr_tables TEXT[] := &#39;{tr_assoc_ping, tr_assoc_pong}&#39;;
<a name="l02059"></a>02059 cur_tr_table TEXT := &#39;tr_assoc_ping&#39;;
<a name="l02060"></a>02060 need_analyze BOOL := &#39;t&#39;::BOOL;
<a name="l02061"></a>02061 attr_count INT;
<a name="l02062"></a>02062 BEGIN
<a name="l02063"></a>02063 -- record the time spent in the different steps of training
<a name="l02064"></a>02064 begin_func_exec = clock_timestamp();
<a name="l02065"></a>02065 scv_acs_time = begin_func_exec - begin_func_exec;
<a name="l02066"></a>02066 calc_olap_time = scv_acs_time;
<a name="l02067"></a>02067 calc_acc_time = scv_acs_time;
<a name="l02068"></a>02068 calc_pre_time = scv_acs_time;
<a name="l02069"></a>02069 ins_upd_time = scv_acs_time;
<a name="l02070"></a>02070 calc_update_best = scv_acs_time;
<a name="l02071"></a>02071 calc_update_child = scv_acs_time;
<a name="l02072"></a>02072 calc_update_nid = scv_acs_time;
<a name="l02073"></a>02073 bld_assoc_time = scv_acs_time;
<a name="l02074"></a>02074 prune_time = scv_acs_time;
<a name="l02075"></a>02075
<a name="l02076"></a>02076 IF(split_criterion = &#39;infogain&#39;) THEN
<a name="l02077"></a>02077 sc_code = 1;
<a name="l02078"></a>02078 ELSIF (split_criterion = &#39;gainratio&#39;) THEN
<a name="l02079"></a>02079 sc_code = 2;
<a name="l02080"></a>02080 ELSIF (split_criterion = &#39;gini&#39;) THEN
<a name="l02081"></a>02081 sc_code = 3;
<a name="l02082"></a>02082 ELSE
<a name="l02083"></a>02083 RAISE EXCEPTION &#39;%&#39;, &#39;Invalid split criterion!&#39;;
<a name="l02084"></a>02084 END IF;
<a name="l02085"></a>02085
<a name="l02086"></a>02086 num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
<a name="l02087"></a>02087
<a name="l02088"></a>02088 IF(verbosity &gt; 0) THEN
<a name="l02089"></a>02089 RAISE INFO &#39;NUMBER OF CLASSES IN THE TRAINING SET %&#39;, num_classes;
<a name="l02090"></a>02090 END IF;
<a name="l02091"></a>02091
<a name="l02092"></a>02092 IF(num_classes &lt; 2) THEN
<a name="l02093"></a>02093 RAISE EXCEPTION &#39;the number of classes must be greater than 2&#39;;
<a name="l02094"></a>02094 END IF;
<a name="l02095"></a>02095
<a name="l02096"></a>02096 curstmt = MADLIB_SCHEMA.__format
<a name="l02097"></a>02097 (
<a name="l02098"></a>02098 &#39;SELECT
<a name="l02099"></a>02099 count(*)
<a name="l02100"></a>02100 FROM %
<a name="l02101"></a>02101 WHERE column_type=&#39;&#39;f&#39;&#39;&#39;,
<a name="l02102"></a>02102 training_table_meta
<a name="l02103"></a>02103 );
<a name="l02104"></a>02104 EXECUTE curstmt INTO attr_count;
<a name="l02105"></a>02105
<a name="l02106"></a>02106 -- generate the horizontal table for updating assigned node IDs
<a name="l02107"></a>02107 PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
<a name="l02108"></a>02108 (
<a name="l02109"></a>02109 &#39;tmp_dt_hori_table&#39;,
<a name="l02110"></a>02110 training_table_name,
<a name="l02111"></a>02111 attr_count,
<a name="l02112"></a>02112 verbosity
<a name="l02113"></a>02113 );
<a name="l02114"></a>02114
<a name="l02115"></a>02115 EXECUTE &#39;SELECT count(*) FROM tmp_dt_hori_table&#39; INTO total_size;
<a name="l02116"></a>02116
<a name="l02117"></a>02117 IF(verbosity &gt; 0) THEN
<a name="l02118"></a>02118 RAISE INFO &#39;INPUT TABLE SIZE: %&#39;, total_size;
<a name="l02119"></a>02119 END IF;
<a name="l02120"></a>02120
<a name="l02121"></a>02121 begin_bld_assoc = clock_timestamp();
<a name="l02122"></a>02122 cur_tr_table = tr_tables[tr_table_index];
<a name="l02123"></a>02123
<a name="l02124"></a>02124 -- The table of tr_assoc holds the information of which records are
<a name="l02125"></a>02125 -- used during training for each tree.
<a name="l02126"></a>02126 -- It has four columns.
<a name="l02127"></a>02127 -- id -- The id of one record.
<a name="l02128"></a>02128 -- tid -- The id of a tree.
<a name="l02129"></a>02129 -- nid -- The id of a node in a tree.
<a name="l02130"></a>02130 -- weight -- The times a record is assigned to a node.
<a name="l02131"></a>02131 IF (sampling_needed) THEN
<a name="l02132"></a>02132 PERFORM MADLIB_SCHEMA.__sample_with_replacement
<a name="l02133"></a>02133 (
<a name="l02134"></a>02134 num_trees,
<a name="l02135"></a>02135 round(sampling_percentage * total_size)::INT,
<a name="l02136"></a>02136 &#39;tmp_dt_hori_table&#39;,
<a name="l02137"></a>02137 cur_tr_table
<a name="l02138"></a>02138 );
<a name="l02139"></a>02139 ELSE
<a name="l02140"></a>02140 curstmt = MADLIB_SCHEMA.__format
<a name="l02141"></a>02141 (
<a name="l02142"></a>02142 &#39;INSERT INTO %
<a name="l02143"></a>02143 SELECT id, 1 as tid, 1 as nid, 1 as weight
<a name="l02144"></a>02144 FROM %&#39;,
<a name="l02145"></a>02145 ARRAY[
<a name="l02146"></a>02146 cur_tr_table,
<a name="l02147"></a>02147 &#39;tmp_dt_hori_table&#39;
<a name="l02148"></a>02148 ]
<a name="l02149"></a>02149 );
<a name="l02150"></a>02150 EXECUTE curstmt;
<a name="l02151"></a>02151 END IF;
<a name="l02152"></a>02152
<a name="l02153"></a>02153 -- analyze ping
<a name="l02154"></a>02154 EXECUTE &#39;ANALYZE &#39; || cur_tr_table;
<a name="l02155"></a>02155 bld_assoc_time = clock_timestamp() - begin_bld_assoc;
<a name="l02156"></a>02156
<a name="l02157"></a>02157 -- generate the root node for all trees.
<a name="l02158"></a>02158 -- the generated numbers are the same for the two generate_series
<a name="l02159"></a>02159 SELECT MADLIB_SCHEMA.__format
<a name="l02160"></a>02160 (
<a name="l02161"></a>02161 &#39;INSERT INTO %
<a name="l02162"></a>02162 (id, tree_location, feature, probability, max_class,scv,
<a name="l02163"></a>02163 live, num_of_samples, parent_id, tid)
<a name="l02164"></a>02164 SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
<a name="l02165"></a>02165 generate_series(1, %)&#39;,
<a name="l02166"></a>02166 ARRAY[
<a name="l02167"></a>02167 result_tree_table_name,
<a name="l02168"></a>02168 num_trees::TEXT,
<a name="l02169"></a>02169 num_trees::TEXT
<a name="l02170"></a>02170 ]
<a name="l02171"></a>02171 ) INTO curstmt;
<a name="l02172"></a>02172
<a name="l02173"></a>02173 EXECUTE curstmt;
<a name="l02174"></a>02174
<a name="l02175"></a>02175 max_nid = num_trees;
<a name="l02176"></a>02176 location_size = 0;
<a name="l02177"></a>02177
<a name="l02178"></a>02178
<a name="l02179"></a>02179 LOOP
<a name="l02180"></a>02180 EXECUTE &#39;SELECT COUNT(id) FROM &#39; || result_tree_table_name ||
<a name="l02181"></a>02181 &#39; WHERE live &gt; 0 AND array_upper(tree_location,1)=&#39;||
<a name="l02182"></a>02182 curr_level||&#39;;&#39; INTO num_live_nodes;
<a name="l02183"></a>02183
<a name="l02184"></a>02184 IF (num_live_nodes &lt; 1) THEN
<a name="l02185"></a>02185 IF(verbosity &gt; 0) THEN
<a name="l02186"></a>02186 RAISE INFO &#39;EXIT: %&#39;, &#39;no live nodes to split&#39;;
<a name="l02187"></a>02187 END IF;
<a name="l02188"></a>02188
<a name="l02189"></a>02189 EXIT;
<a name="l02190"></a>02190 END IF;
<a name="l02191"></a>02191
<a name="l02192"></a>02192 IF (verbosity &gt; 0) THEN
<a name="l02193"></a>02193 RAISE INFO &#39;Running on level:%&#39;, curr_level;
<a name="l02194"></a>02194 END IF;
<a name="l02195"></a>02195
<a name="l02196"></a>02196 begin_olap_acs = clock_timestamp();
<a name="l02197"></a>02197
<a name="l02198"></a>02198 instance_time = MADLIB_SCHEMA.__gen_acc
<a name="l02199"></a>02199 (
<a name="l02200"></a>02200 training_table_name,
<a name="l02201"></a>02201 training_table_meta,
<a name="l02202"></a>02202 result_tree_table_name,
<a name="l02203"></a>02203 cur_tr_table,
<a name="l02204"></a>02204 &#39;sf_assoc&#39;,
<a name="l02205"></a>02205 features_per_node,
<a name="l02206"></a>02206 num_classes,
<a name="l02207"></a>02207 sampling_needed,
<a name="l02208"></a>02208 verbosity
<a name="l02209"></a>02209 );
<a name="l02210"></a>02210
<a name="l02211"></a>02211 IF (h2hmv_routine_id=1) THEN
<a name="l02212"></a>02212 -- For ignore, we need the true size of nodes to handle the
<a name="l02213"></a>02213 -- missing values.
<a name="l02214"></a>02214 TRUNCATE node_size_aux;
<a name="l02215"></a>02215
<a name="l02216"></a>02216 curstmt = MADLIB_SCHEMA.__format
<a name="l02217"></a>02217 (
<a name="l02218"></a>02218 &#39;INSERT INTO node_size_aux
<a name="l02219"></a>02219 SELECT tr.tid, tr.nid, sum(weight) as count
<a name="l02220"></a>02220 FROM % tr
<a name="l02221"></a>02221 GROUP BY tr.tid, tr.nid&#39;,
<a name="l02222"></a>02222 cur_tr_table
<a name="l02223"></a>02223 );
<a name="l02224"></a>02224
<a name="l02225"></a>02225 EXECUTE curstmt;
<a name="l02226"></a>02226 END IF;
<a name="l02227"></a>02227
<a name="l02228"></a>02228 calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
<a name="l02229"></a>02229 calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
<a name="l02230"></a>02230 calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
<a name="l02231"></a>02231
<a name="l02232"></a>02232 curr_level = curr_level + 1;
<a name="l02233"></a>02233
<a name="l02234"></a>02234 begin_find_best = clock_timestamp();
<a name="l02235"></a>02235
<a name="l02236"></a>02236 PERFORM MADLIB_SCHEMA.__find_best_split
<a name="l02237"></a>02237 (
<a name="l02238"></a>02238 &#39;training_instance&#39;,
<a name="l02239"></a>02239 confidence_level,
<a name="l02240"></a>02240 training_table_meta,
<a name="l02241"></a>02241 sc_code,
<a name="l02242"></a>02242 grow_tree,
<a name="l02243"></a>02243 &#39;find_best_answer_table&#39;,
<a name="l02244"></a>02244 h2hmv_routine_id,
<a name="l02245"></a>02245 num_classes
<a name="l02246"></a>02246 );
<a name="l02247"></a>02247 IF (verbosity &gt; 0) THEN
<a name="l02248"></a>02248 RAISE INFO &#39;find best time at this level:%&#39;,
<a name="l02249"></a>02249 clock_timestamp() - begin_find_best;
<a name="l02250"></a>02250 END IF;
<a name="l02251"></a>02251 grow_tree = grow_tree - 1;
<a name="l02252"></a>02252
<a name="l02253"></a>02253 scv_acs_time = scv_acs_time +
<a name="l02254"></a>02254 (clock_timestamp() - begin_find_best);
<a name="l02255"></a>02255 begin_data_transfer = clock_timestamp();
<a name="l02256"></a>02256 begin_update_best = clock_timestamp();
<a name="l02257"></a>02257
<a name="l02258"></a>02258 -- We get the calculation result for current level.
<a name="l02259"></a>02259 -- Update the nodes of previous level firstly.
<a name="l02260"></a>02260 SELECT MADLIB_SCHEMA.__format
<a name="l02261"></a>02261 (
<a name="l02262"></a>02262 &#39;UPDATE % t
<a name="l02263"></a>02263 SET feature = c.feature,
<a name="l02264"></a>02264 probability = c.probability,
<a name="l02265"></a>02265 max_class = c.max_class,
<a name="l02266"></a>02266 scv = c.max_scv,
<a name="l02267"></a>02267 ebp_coeff = c.ebp_coeff,
<a name="l02268"></a>02268 num_of_samples = c.node_size,
<a name="l02269"></a>02269 live = 0,
<a name="l02270"></a>02270 is_cont = c.is_cont,
<a name="l02271"></a>02271 split_value = c.split_value
<a name="l02272"></a>02272 FROM find_best_answer_table c
<a name="l02273"></a>02273 WHERE t.id=c.node_id AND t.tid=c.tid&#39;,
<a name="l02274"></a>02274 ARRAY[
<a name="l02275"></a>02275 result_tree_table_name::TEXT
<a name="l02276"></a>02276 ]
<a name="l02277"></a>02277 ) INTO curstmt;
<a name="l02278"></a>02278
<a name="l02279"></a>02279 EXECUTE curstmt;
<a name="l02280"></a>02280
<a name="l02281"></a>02281 calc_update_best = calc_update_best +
<a name="l02282"></a>02282 (clock_timestamp() - begin_update_best);
<a name="l02283"></a>02283 begin_update_child = clock_timestamp();
<a name="l02284"></a>02284
<a name="l02285"></a>02285 curstmt=
<a name="l02286"></a>02286 MADLIB_SCHEMA.__format(
<a name="l02287"></a>02287 &#39;INSERT INTO %(id, tree_location, feature, probability,
<a name="l02288"></a>02288 max_class, scv, live, parent_id, tid, dp_ids)
<a name="l02289"></a>02289 SELECT %+row, array_append(tree_location, fval),
<a name="l02290"></a>02290 0, 1, 1, 1, %, ans.node_id, ans.tid,
<a name="l02291"></a>02291 CASE when(NOT ans.is_cont) then
<a name="l02292"></a>02292 array_append( dp_ids, ans.feature)
<a name="l02293"></a>02293 ELSE
<a name="l02294"></a>02294 dp_ids
<a name="l02295"></a>02295 END
<a name="l02296"></a>02296 FROM % tree,
<a name="l02297"></a>02297 (
<a name="l02298"></a>02298 SELECT *,
<a name="l02299"></a>02299 row_number()
<a name="l02300"></a>02300 OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
<a name="l02301"></a>02301 FROM
<a name="l02302"></a>02302 (
<a name="l02303"></a>02303 SELECT *,
<a name="l02304"></a>02304 CASE WHEN (is_cont) THEN
<a name="l02305"></a>02305 generate_series(1,2)
<a name="l02306"></a>02306 ELSE
<a name="l02307"></a>02307 generate_series(1, distinct_features)
<a name="l02308"></a>02308 END AS fval
<a name="l02309"></a>02309 FROM
<a name="l02310"></a>02310 find_best_answer_table
<a name="l02311"></a>02311 WHERE live&gt;0 AND coalesce(feature, 0) &lt;&gt; 0
<a name="l02312"></a>02312 AND node_size &gt;= % AND node_size &gt;= %
<a name="l02313"></a>02313 ) l
<a name="l02314"></a>02314 ) ans
<a name="l02315"></a>02315 WHERE tree.id=ans.node_id and tree.tid=ans.tid;&#39;,
<a name="l02316"></a>02316 ARRAY[
<a name="l02317"></a>02317 result_tree_table_name,
<a name="l02318"></a>02318 (max_nid)::TEXT,
<a name="l02319"></a>02319 curr_level::TEXT,
<a name="l02320"></a>02320 result_tree_table_name,
<a name="l02321"></a>02321 (total_size * node_prune_threshold)::TEXT,
<a name="l02322"></a>02322 (total_size * node_split_threshold)::TEXT
<a name="l02323"></a>02323 ]
<a name="l02324"></a>02324 );
<a name="l02325"></a>02325 IF(verbosity &gt; 0) THEN
<a name="l02326"></a>02326 RAISE INFO &#39;Generate Child Nodes:%&#39;, curstmt;
<a name="l02327"></a>02327 END IF;
<a name="l02328"></a>02328
<a name="l02329"></a>02329 EXECUTE curstmt;
<a name="l02330"></a>02330
<a name="l02331"></a>02331 EXECUTE &#39;SELECT max(id) FROM &#39;||result_tree_table_name INTO max_nid;
<a name="l02332"></a>02332
<a name="l02333"></a>02333 IF(verbosity &gt; 0) THEN
<a name="l02334"></a>02334 RAISE INFO &#39;Max nid:%, level:%&#39;, max_nid, curr_level;
<a name="l02335"></a>02335 END IF;
<a name="l02336"></a>02336
<a name="l02337"></a>02337 -- insert the leftmost child node id and relevant info
<a name="l02338"></a>02338 -- to the assoc_aux table, so that we will make use of this
<a name="l02339"></a>02339 -- info to update the assigned nid the samples belong to
<a name="l02340"></a>02340 -- the current node whose id is answer.node_id.
<a name="l02341"></a>02341 SELECT MADLIB_SCHEMA.__format
<a name="l02342"></a>02342 (
<a name="l02343"></a>02343 &#39;INSERT INTO assoc_aux
<a name="l02344"></a>02344 (nid, fid, lmc_id, svalue, is_cont)
<a name="l02345"></a>02345 SELECT t.id, t.feature, min(l.id),
<a name="l02346"></a>02346 t.split_value, t.is_cont
<a name="l02347"></a>02347 FROM
<a name="l02348"></a>02348 (SELECT id, parent_id
<a name="l02349"></a>02349 FROM %
<a name="l02350"></a>02350 WHERE array_upper(tree_location,1)=%) l,
<a name="l02351"></a>02351 % t
<a name="l02352"></a>02352 WHERE l.parent_id=t.id
<a name="l02353"></a>02353 GROUP BY t.id, t.feature, t.split_value, t.is_cont;&#39;,
<a name="l02354"></a>02354 ARRAY[
<a name="l02355"></a>02355 result_tree_table_name,
<a name="l02356"></a>02356 curr_level::TEXT,
<a name="l02357"></a>02357 result_tree_table_name
<a name="l02358"></a>02358 ]
<a name="l02359"></a>02359 ) INTO curstmt;
<a name="l02360"></a>02360
<a name="l02361"></a>02361 IF(verbosity &gt; 0) THEN
<a name="l02362"></a>02362 RAISE INFO &#39;Update lmc_child Info:%&#39;, curstmt;
<a name="l02363"></a>02363 END IF;
<a name="l02364"></a>02364
<a name="l02365"></a>02365 EXECUTE curstmt;
<a name="l02366"></a>02366
<a name="l02367"></a>02367 -- delete the unused nodes on the previous level
<a name="l02368"></a>02368 -- delete those nodes with a size less than node_prune_threshold
<a name="l02369"></a>02369 -- node_prune_threshold will not apply to root node,
<a name="l02370"></a>02370 -- the level is 1 (curr_level - 1 = 1);
<a name="l02371"></a>02371 IF (curr_level &gt; 2) THEN
<a name="l02372"></a>02372 curstmt = MADLIB_SCHEMA.__format
<a name="l02373"></a>02373 (
<a name="l02374"></a>02374 &#39;DELETE FROM % t
<a name="l02375"></a>02375 WHERE t.num_of_samples &lt; % OR live = %;&#39;,
<a name="l02376"></a>02376 ARRAY[
<a name="l02377"></a>02377 result_tree_table_name::TEXT,
<a name="l02378"></a>02378 (total_size * node_prune_threshold)::TEXT,
<a name="l02379"></a>02379 (curr_level - 1)::TEXT
<a name="l02380"></a>02380 ]
<a name="l02381"></a>02381 );
<a name="l02382"></a>02382 EXECUTE curstmt;
<a name="l02383"></a>02383 END IF;
<a name="l02384"></a>02384
<a name="l02385"></a>02385 calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
<a name="l02386"></a>02386 begin_update_nid = clock_timestamp();
<a name="l02387"></a>02387
<a name="l02388"></a>02388 -- update the assigned node id for each sample on the current level
<a name="l02389"></a>02389 tr_table_index = (tr_table_index % 2) + 1;
<a name="l02390"></a>02390 curstmt = MADLIB_SCHEMA.__format
<a name="l02391"></a>02391 (
<a name="l02392"></a>02392 &#39;INSERT INTO % (id, nid, tid, weight)
<a name="l02393"></a>02393 SELECT
<a name="l02394"></a>02394 tr.id,
<a name="l02395"></a>02395 au.lmc_id - 1 +
<a name="l02396"></a>02396 CASE WHEN (au.is_cont) THEN
<a name="l02397"></a>02397 CASE WHEN (svalue &lt; vt.fvals[au.fid]) THEN
<a name="l02398"></a>02398 2
<a name="l02399"></a>02399 ELSE
<a name="l02400"></a>02400 1
<a name="l02401"></a>02401 END
<a name="l02402"></a>02402 ELSE
<a name="l02403"></a>02403 vt.fvals[au.fid]::INT
<a name="l02404"></a>02404 END AS nid,
<a name="l02405"></a>02405 tid, weight
<a name="l02406"></a>02406 FROM % tr, % vt, assoc_aux au
<a name="l02407"></a>02407 WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL&#39;,
<a name="l02408"></a>02408 ARRAY[
<a name="l02409"></a>02409 tr_tables[tr_table_index],
<a name="l02410"></a>02410 cur_tr_table,
<a name="l02411"></a>02411 &#39;tmp_dt_hori_table&#39;
<a name="l02412"></a>02412 ]
<a name="l02413"></a>02413 );
<a name="l02414"></a>02414 IF (verbosity &gt; 0) THEN
<a name="l02415"></a>02415 RAISE INFO &#39;%&#39;, curstmt;
<a name="l02416"></a>02416 END IF;
<a name="l02417"></a>02417
<a name="l02418"></a>02418 EXECUTE curstmt;
<a name="l02419"></a>02419 EXECUTE &#39;TRUNCATE &#39; || cur_tr_table;
<a name="l02420"></a>02420 cur_tr_table = tr_tables[tr_table_index];
<a name="l02421"></a>02421
<a name="l02422"></a>02422 IF (need_analyze) THEN
<a name="l02423"></a>02423 -- analyze pong table
<a name="l02424"></a>02424 EXECUTE &#39;ANALYZE &#39; || cur_tr_table;
<a name="l02425"></a>02425 need_analyze = &#39;f&#39;::BOOL;
<a name="l02426"></a>02426 END IF;
<a name="l02427"></a>02427
<a name="l02428"></a>02428 EXECUTE &#39;TRUNCATE assoc_aux&#39;;
<a name="l02429"></a>02429
<a name="l02430"></a>02430 calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
<a name="l02431"></a>02431
<a name="l02432"></a>02432 ins_upd_time = ins_upd_time +
<a name="l02433"></a>02433 (clock_timestamp() - begin_data_transfer);
<a name="l02434"></a>02434 IF(verbosity &gt; 0) THEN
<a name="l02435"></a>02435 RAISE INFO &#39;computation time in this level:%&#39;,
<a name="l02436"></a>02436 (clock_timestamp() - begin_find_best);
<a name="l02437"></a>02437 END IF;
<a name="l02438"></a>02438
<a name="l02439"></a>02439 END LOOP;
<a name="l02440"></a>02440
<a name="l02441"></a>02441 PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
<a name="l02442"></a>02442
<a name="l02443"></a>02443 begin_prune = clock_timestamp();
<a name="l02444"></a>02444 IF (confidence_level &lt; 100.0) THEN
<a name="l02445"></a>02445 PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
<a name="l02446"></a>02446 END IF;
<a name="l02447"></a>02447
<a name="l02448"></a>02448 IF (validation_table_name IS NOT NULL) THEN
<a name="l02449"></a>02449 PERFORM MADLIB_SCHEMA.__rep_prune_tree
<a name="l02450"></a>02450 (
<a name="l02451"></a>02451 result_tree_table_name,
<a name="l02452"></a>02452 validation_table_name ,
<a name="l02453"></a>02453 num_classes
<a name="l02454"></a>02454 );
<a name="l02455"></a>02455 END IF;
<a name="l02456"></a>02456 prune_time = clock_timestamp() - begin_prune;
<a name="l02457"></a>02457
<a name="l02458"></a>02458 IF(verbosity &gt; 0) THEN
<a name="l02459"></a>02459 RAISE INFO &#39;time of sampling with replacement: %&#39;, bld_assoc_time;
<a name="l02460"></a>02460 RAISE INFO &#39;time of finding best and calculating ACS: %&#39;, scv_acs_time;
<a name="l02461"></a>02461 RAISE INFO &#39;time of calculating ACC: %&#39;, calc_acc_time;
<a name="l02462"></a>02462 RAISE INFO &#39;time of Insert/update operation: %&#39;, ins_upd_time;
<a name="l02463"></a>02463 RAISE INFO &#39;time of pruning: %&#39;, prune_time;
<a name="l02464"></a>02464 RAISE INFO &#39;time of training: %&#39;, clock_timestamp() - begin_func_exec;
<a name="l02465"></a>02465 END IF;
<a name="l02466"></a>02466
<a name="l02467"></a>02467 SELECT MADLIB_SCHEMA.__format
<a name="l02468"></a>02468 (
<a name="l02469"></a>02469 &#39;SELECT COUNT(id), max(array_upper(tree_location, 1))
<a name="l02470"></a>02470 FROM %&#39;,
<a name="l02471"></a>02471 ARRAY[
<a name="l02472"></a>02472 result_tree_table_name
<a name="l02473"></a>02473 ]
<a name="l02474"></a>02474 ) INTO curstmt;
<a name="l02475"></a>02475
<a name="l02476"></a>02476 EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
<a name="l02477"></a>02477
<a name="l02478"></a>02478 ret.features_per_node = features_per_node;
<a name="l02479"></a>02479 ret.num_of_samples = total_size;
<a name="l02480"></a>02480 ret.calc_acc_time = calc_acc_time;
<a name="l02481"></a>02481 ret.calc_pre_time = calc_pre_time;
<a name="l02482"></a>02482 ret.update_time = ins_upd_time;
<a name="l02483"></a>02483 ret.update_best = calc_update_best;
<a name="l02484"></a>02484 ret.update_child = calc_update_child;
<a name="l02485"></a>02485 ret.update_nid = calc_update_nid;
<a name="l02486"></a>02486 ret.scv_acs_time = scv_acs_time;
<a name="l02487"></a>02487 ret.prune_time = prune_time;
<a name="l02488"></a>02488
<a name="l02489"></a>02489 RETURN ret;
<a name="l02490"></a>02490 END
<a name="l02491"></a>02491 $$ LANGUAGE PLPGSQL;
<a name="l02492"></a>02492
<a name="l02493"></a>02493
<a name="l02494"></a>02494 <span class="comment">/*</span>
<a name="l02495"></a>02495 <span class="comment"> * @brief Internal step function of the aggregate __display_tree_aggr. It</span>
<a name="l02496"></a>02496 <span class="comment"> * renders one tree node as a line of human readable text and appends that</span>
<a name="l02497"></a>02497 <span class="comment"> * line to the text accumulated so far.</span>
<a name="l02498"></a>02498 <span class="comment"> *</span>
<a name="l02499"></a>02499 <span class="comment"> * @param state The display text accumulated so far. If it is NULL,</span>
<a name="l02500"></a>02500 <span class="comment"> * only the line for this node is returned.</span>
<a name="l02501"></a>02501 <span class="comment"> * @param depth The depth of this node; 0 or less is shown as the root.</span>
<a name="l02502"></a>02502 <span class="comment"> * @param is_cont Whether the feature used to split is continuous.</span>
<a name="l02503"></a>02503 <span class="comment"> * @param feat_name The name of the feature used to split.</span>
<a name="l02504"></a>02504 <span class="comment"> * @param curr_val The value of the splitting feature for this node.</span>
<a name="l02505"></a>02505 <span class="comment"> * @param split_value For a continuous feature, the split value.</span>
<a name="l02506"></a>02506 <span class="comment"> * Otherwise, it has no meaning.</span>
<a name="l02507"></a>02507 <span class="comment"> * @param max_prob For the elements in this node, the probability that</span>
<a name="l02508"></a>02508 <span class="comment"> * an element belongs to max_class.</span>
<a name="l02509"></a>02509 <span class="comment"> * @param max_class The class ID with the largest number of elements</span>
<a name="l02510"></a>02510 <span class="comment"> * in this node. NULL is displayed as the text null.</span>
<a name="l02511"></a>02511 <span class="comment"> * @param num_of_samples Total count of samples in this node.</span>
<a name="l02512"></a>02512 <span class="comment"> *</span>
<a name="l02513"></a>02513 <span class="comment"> * @return The accumulated display text with the line for this node</span>
<a name="l02514"></a>02514 <span class="comment"> * appended. NULL display values are replaced by placeholders</span>
<a name="l02515"></a>02515 <span class="comment"> * so that they cannot turn the whole result into NULL.</span>
<a name="l02516"></a>02516 <span class="comment"> */</span>
<a name="l02517"></a>02517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
<a name="l02518"></a>02518 (
<a name="l02519"></a>02519 state TEXT,
<a name="l02520"></a>02520 depth INT,
<a name="l02521"></a>02521 is_cont BOOLEAN,
<a name="l02522"></a>02522 feat_name TEXT,
<a name="l02523"></a>02523 curr_val TEXT,
<a name="l02524"></a>02524 split_value FLOAT8,
<a name="l02525"></a>02525 max_prob FLOAT8,
<a name="l02526"></a>02526 max_class TEXT,
<a name="l02527"></a>02527 num_of_samples INT
<a name="l02528"></a>02528 )
<a name="l02529"></a>02529 RETURNS TEXT AS $$
<a name="l02530"></a>02530 DECLARE
<a name="l02531"></a>02531 ret TEXT := &#39;&#39;;
<a name="l02532"></a>02532 index INT;
<a name="l02533"></a>02533 BEGIN
<a name="l02534"></a>02534 -- Indent the line according to the depth of the node.
<a name="l02535"></a>02535 FOR index IN 0..depth LOOP
<a name="l02536"></a>02536 ret = ret || &#39; &#39;;
<a name="l02537"></a>02537 END LOOP;
<a name="l02538"></a>02538
<a name="l02539"></a>02539 IF (depth &gt; 0) THEN
<a name="l02540"></a>02540 ret = ret ||coalesce(feat_name,&#39;null&#39;)||&#39;: &#39;;
<a name="l02541"></a>02541 -- A continuous feature has exactly two branches: curr_val is 1
<a name="l02542"></a>02542 -- for the branch with values &lt;= split_value; any other curr_val
<a name="l02543"></a>02543 -- is displayed as the branch with values &gt; split_value.
<a name="l02544"></a>02544 IF (is_cont) THEN
<a name="l02545"></a>02545 IF (curr_val::INT = 1) THEN
<a name="l02546"></a>02546 ret = ret || &#39; &lt;= &#39;;
<a name="l02547"></a>02547 ELSE
<a name="l02548"></a>02548 ret = ret || &#39; &gt; &#39;;
<a name="l02549"></a>02549 END IF;
<a name="l02550"></a>02550 ret = ret||coalesce(split_value,0)||&#39; &#39;;
<a name="l02551"></a>02551 ELSE
<a name="l02552"></a>02552 ret = ret||&#39; = &#39;||coalesce(curr_val,&#39;null&#39;)||&#39; &#39;;
<a name="l02553"></a>02553 END IF;
<a name="l02554"></a>02554 ELSE
<a name="l02555"></a>02555 ret = ret||&#39;Root Node &#39;;
<a name="l02556"></a>02556 END IF;
<a name="l02557"></a>02557 -- Coalesce every operand: concatenating a NULL would make ret NULL.
<a name="l02558"></a>02558 ret = ret ||
<a name="l02559"></a>02559 &#39; : class(&#39; ||
<a name="l02560"></a>02560 coalesce(max_class,&#39;null&#39;) ||
<a name="l02561"></a>02561 &#39;) num_elements(&#39; ||
<a name="l02562"></a>02562 coalesce(num_of_samples,0) ||
<a name="l02563"></a>02563 &#39;) predict_prob(&#39; ||
<a name="l02564"></a>02564 coalesce(max_prob,0) ||
<a name="l02565"></a>02565 &#39;)&#39;;
<a name="l02566"></a>02566
<a name="l02567"></a>02567 ret = ret || E&#39;\n&#39;;
<a name="l02568"></a>02568
<a name="l02569"></a>02569 -- Prepend the text accumulated so far, if any, to the line
<a name="l02570"></a>02570 -- built for this node.
<a name="l02571"></a>02571 IF (state IS NOT NULL) THEN
<a name="l02572"></a>02572 ret = state || ret;
<a name="l02573"></a>02573 END IF;
<a name="l02574"></a>02574
<a name="l02575"></a>02575 RETURN ret;
<a name="l02576"></a>02576 END
<a name="l02577"></a>02577 $$ LANGUAGE PLPGSQL;
<a name="l02578"></a>02578 -- Recreate the aggregate that builds the display text of a tree by applying
<a name="l02579"></a>02579 -- __display_node_sfunc to each input row; a previous definition is dropped first.
<a name="l02580"></a>02580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
<a name="l02581"></a>02581 (
<a name="l02582"></a>02582 INT, -- depth
<a name="l02583"></a>02583 BOOLEAN, -- is_cont
<a name="l02584"></a>02584 TEXT, -- feature name
<a name="l02585"></a>02585 TEXT, -- curr_val
<a name="l02586"></a>02586 FLOAT8, -- split value
<a name="l02587"></a>02587 FLOAT8, -- max_probability
<a name="l02588"></a>02588 TEXT, -- max_class
<a name="l02589"></a>02589 INT -- num_of_samples
<a name="l02590"></a>02590 ) CASCADE;
<a name="l02591"></a>02591 CREATE -- declared ORDERED only on Greenplum builds with ordered-aggregate support
<a name="l02592"></a>02592 m4_ifdef(`__GREENPLUM__&#39;, m4_ifdef(`__HAS_ORDERED_AGGREGATES__&#39;, `ORDERED&#39;))
<a name="l02593"></a>02593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
<a name="l02594"></a>02594 (
<a name="l02595"></a>02595 INT, -- depth
<a name="l02596"></a>02596 BOOLEAN, -- is_cont
<a name="l02597"></a>02597 TEXT, -- feature name
<a name="l02598"></a>02598 TEXT, -- curr_val
<a name="l02599"></a>02599 FLOAT8, -- split value
<a name="l02600"></a>02600 FLOAT8, -- max_probability
<a name="l02601"></a>02601 TEXT, -- max_class
<a name="l02602"></a>02602 INT -- num_of_samples
<a name="l02603"></a>02603 )
<a name="l02604"></a>02604 (
<a name="l02605"></a>02605 SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
<a name="l02606"></a>02606 STYPE=TEXT -- no INITCOND is given, so the state starts out as NULL
<a name="l02607"></a>02607 );
<a name="l02608"></a>02608
<a name="l02609"></a>02609
<a name="l02610"></a>02610 <span class="comment">/*</span>
<a name="l02611"></a>02611 <span class="comment"> * @brief Display the trained model in human readable format. This function</span>
<a name="l02612"></a>02612 <span class="comment"> * leverages an ordered aggregate to display the tree with only one scan of</span>
<a name="l02613"></a>02613 <span class="comment"> * the tree_table.</span>
<a name="l02614"></a>02614 <span class="comment"> *</span>
<a name="l02615"></a>02615 <span class="comment"> * @param tree_table The full name of the tree table. </span>
<a name="l02616"></a>02616 <span class="comment"> * @param tree_id The array contains the IDs of the trees to be displayed.</span>
<a name="l02617"></a>02617 <span class="comment"> * @param max_depth The max depth to be displayed. If it is set to null,</span>
<a name="l02618"></a>02618 <span class="comment"> * this function will show all levels. </span>
<a name="l02619"></a>02619 <span class="comment"> *</span>
<a name="l02620"></a>02620 <span class="comment"> * @return The text representing the tree in human readable format.</span>
<a name="l02621"></a>02621 <span class="comment"> *</span>
<a name="l02622"></a>02622 <span class="comment"> */</span>
<a name="l02623"></a>02623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
<a name="l02624"></a>02624 (
<a name="l02625"></a>02625 tree_table TEXT,
<a name="l02626"></a>02626 tree_id INT[],
<a name="l02627"></a>02627 max_depth INT
<a name="l02628"></a>02628 )
<a name="l02629"></a>02629 RETURNS SETOF TEXT AS $$
<a name="l02630"></a>02630 DECLARE
<a name="l02631"></a>02631 metatable_name TEXT := null;
<a name="l02632"></a>02632 curr_stmt TEXT := null;
<a name="l02633"></a>02633 feature_name TEXT := null; -- declared but never referenced in the body below
<a name="l02634"></a>02634 table_name TEXT := null;
<a name="l02635"></a>02635 result TEXT := &#39;&#39;;
<a name="l02636"></a>02636 result_rec RECORD;
<a name="l02637"></a>02637 BEGIN
<a name="l02638"></a>02638 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l02639"></a>02639 (
<a name="l02640"></a>02640 tree_table,
<a name="l02641"></a>02641 &#39;t&#39;
<a name="l02642"></a>02642 );
<a name="l02643"></a>02643 -- Resolve the name of the metadata table that belongs to this tree table.
<a name="l02644"></a>02644 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
<a name="l02645"></a>02645
<a name="l02646"></a>02646 -- Scratch table used for tree display. It holds one row per node and
<a name="l02647"></a>02647 -- is later updated with the original (pre-encoding) class and feature
<a name="l02648"></a>02648 -- values to facilitate the display procedure.
<a name="l02649"></a>02649 DROP TABLE IF EXISTS auxiliary_tree_display;
<a name="l02650"></a>02650 CREATE TEMP TABLE auxiliary_tree_display
<a name="l02651"></a>02651 (
<a name="l02652"></a>02652 tid INT,
<a name="l02653"></a>02653 id INT,
<a name="l02654"></a>02654 tree_location INT[],
<a name="l02655"></a>02655 probability FLOAT8,
<a name="l02656"></a>02656 max_class TEXT,
<a name="l02657"></a>02657 num_of_samples INT,
<a name="l02658"></a>02658 parent_id INT,
<a name="l02659"></a>02659 curr_value TEXT,
<a name="l02660"></a>02660 parent_feature_id INT,
<a name="l02661"></a>02661 is_parent_feature_cont BOOLEAN,
<a name="l02662"></a>02662 parent_split_value FLOAT8,
<a name="l02663"></a>02663 parent_feature_name TEXT
<a name="l02664"></a>02664 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
<a name="l02665"></a>02665
<a name="l02666"></a>02666 -- Self-join the tree table (restricted to the requested tids) so that each node row also
<a name="l02667"></a>02667 -- carries the feature, is_cont and split_value of its parent; then join the metatable for the feature name.
<a name="l02668"></a>02668 SELECT MADLIB_SCHEMA.__format(
<a name="l02669"></a>02669 &#39;INSERT INTO auxiliary_tree_display SELECT m.*,
<a name="l02670"></a>02670 n.column_name as parent_feature_name
<a name="l02671"></a>02671 FROM
<a name="l02672"></a>02672 (SELECT * FROM
<a name="l02673"></a>02673 (SELECT t1.tid,t1.id, t1.tree_location,
<a name="l02674"></a>02674 t1.probability,t1.max_class::TEXT,
<a name="l02675"></a>02675 t1.num_of_samples,t1.parent_id,
<a name="l02676"></a>02676 t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
<a name="l02677"></a>02677 as curr_value,
<a name="l02678"></a>02678 t2.feature as parent_feature_id,
<a name="l02679"></a>02679 t2.is_cont as is_parent_feature_cont,
<a name="l02680"></a>02680 t2.split_value as parent_split_value
<a name="l02681"></a>02681 FROM % t1 LEFT JOIN % t2 ON
<a name="l02682"></a>02682 (t1.parent_id = t2.id AND
<a name="l02683"></a>02683 (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
<a name="l02684"></a>02684 WHERE l.tid in ( % ) ) m
<a name="l02685"></a>02685 LEFT JOIN % n
<a name="l02686"></a>02686 on m.parent_feature_id = n.id;&#39;,
<a name="l02687"></a>02687 ARRAY[
<a name="l02688"></a>02688 tree_table,
<a name="l02689"></a>02689 tree_table,
<a name="l02690"></a>02690 array_to_string(tree_id,&#39;,&#39;),
<a name="l02691"></a>02691 metatable_name
<a name="l02692"></a>02692 ]
<a name="l02693"></a>02693 )
<a name="l02694"></a>02694 INTO curr_stmt;
<a name="l02695"></a>02695 EXECUTE curr_stmt;
<a name="l02696"></a>02696
<a name="l02697"></a>02697 -- Find the table storing the encoding of the class column (metatable rows with column_type c).
<a name="l02698"></a>02698 SELECT MADLIB_SCHEMA.__format
<a name="l02699"></a>02699 (
<a name="l02700"></a>02700 &#39;SELECT
<a name="l02701"></a>02701 column_name,
<a name="l02702"></a>02702 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
<a name="l02703"></a>02703 FROM %
<a name="l02704"></a>02704 WHERE column_type=&#39;&#39;c&#39;&#39; LIMIT 1&#39;,
<a name="l02705"></a>02705 ARRAY[
<a name="l02706"></a>02706 metatable_name
<a name="l02707"></a>02707 ]
<a name="l02708"></a>02708 ) INTO curr_stmt;
<a name="l02709"></a>02709 -- At most one row is fetched (LIMIT 1); only its table_name field is used.
<a name="l02710"></a>02710 EXECUTE curr_stmt INTO result_rec;
<a name="l02711"></a>02711
<a name="l02712"></a>02712 table_name = result_rec.table_name;
<a name="l02713"></a>02713 -- The class decoding step is skipped when no class encoding table name was obtained.
<a name="l02714"></a>02714 IF (table_name IS NOT NULL) THEN
<a name="l02715"></a>02715 -- Replace the encoded class code in max_class with the original value (fval) from the encoding table.
<a name="l02716"></a>02716 SELECT MADLIB_SCHEMA.__format(
<a name="l02717"></a>02717 &#39;UPDATE auxiliary_tree_display n
<a name="l02718"></a>02718 SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
<a name="l02719"></a>02719 FROM % m
<a name="l02720"></a>02720 WHERE m.code = n.max_class::INT
<a name="l02721"></a>02721 &#39;,
<a name="l02722"></a>02722 ARRAY[
<a name="l02723"></a>02723 table_name
<a name="l02724"></a>02724 ]
<a name="l02725"></a>02725 )
<a name="l02726"></a>02726 INTO curr_stmt;
<a name="l02727"></a>02727 EXECUTE curr_stmt;
<a name="l02728"></a>02728 END IF;
<a name="l02729"></a>02729
<a name="l02730"></a>02730 -- Get the metatables storing the encoding information for discrete features.
<a name="l02731"></a>02731 SELECT MADLIB_SCHEMA.__format
<a name="l02732"></a>02732 (
<a name="l02733"></a>02733 &#39;SELECT
<a name="l02734"></a>02734 id,
<a name="l02735"></a>02735 column_name,
<a name="l02736"></a>02736 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
<a name="l02737"></a>02737 FROM %
<a name="l02738"></a>02738 WHERE NOT is_cont AND column_type=&#39;&#39;f&#39;&#39;;&#39;,
<a name="l02739"></a>02739 ARRAY[
<a name="l02740"></a>02740 metatable_name
<a name="l02741"></a>02741 ]
<a name="l02742"></a>02742 )
<a name="l02743"></a>02743 INTO curr_stmt;
<a name="l02744"></a>02744
<a name="l02745"></a>02745 -- For each discrete feature, replace the encoded curr_value of nodes split on it by the original value (fval).
<a name="l02746"></a>02746 FOR result_rec IN EXECUTE (curr_stmt) LOOP
<a name="l02747"></a>02747 SELECT MADLIB_SCHEMA.__format(
<a name="l02748"></a>02748 &#39;UPDATE auxiliary_tree_display n
<a name="l02749"></a>02749 SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
<a name="l02750"></a>02750 FROM % m
<a name="l02751"></a>02751 WHERE m.code::INT = n.curr_value::INT AND
<a name="l02752"></a>02752 m.fid = % AND
<a name="l02753"></a>02753 n.parent_feature_name = %
<a name="l02754"></a>02754 &#39;,
<a name="l02755"></a>02755 ARRAY[
<a name="l02756"></a>02756 result_rec.table_name,
<a name="l02757"></a>02757 result_rec.id::TEXT,
<a name="l02758"></a>02758 quote_literal(result_rec.column_name)
<a name="l02759"></a>02759 ]
<a name="l02760"></a>02760 )
<a name="l02761"></a>02761 INTO curr_stmt;
<a name="l02762"></a>02762 EXECUTE curr_stmt;
<a name="l02763"></a>02763 END LOOP;
<a name="l02764"></a>02764
<a name="l02765"></a>02765 -- All the information is now available. Invoke the
<a name="l02766"></a>02766 -- aggregation to show the tree.
<a name="l02767"></a>02767 -- If we order by tree_location, we can get the sequence
<a name="l02768"></a>02768 -- of depth first traversal.
<a name="l02769"></a>02769 curr_stmt = &#39;SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
<a name="l02770"></a>02770 array_upper(tree_location,1)-1,
<a name="l02771"></a>02771 is_parent_feature_cont,
<a name="l02772"></a>02772 parent_feature_name,
<a name="l02773"></a>02773 curr_value,
<a name="l02774"></a>02774 parent_split_value,
<a name="l02775"></a>02775 probability,
<a name="l02776"></a>02776 max_class,
<a name="l02777"></a>02777 num_of_samples
<a name="l02778"></a>02778 order by tree_location) AS disp_str
<a name="l02779"></a>02779 FROM auxiliary_tree_display&#39;;
<a name="l02780"></a>02780 -- When max_depth is given, only nodes whose depth (tree_location length minus 1) does not exceed it are aggregated.
<a name="l02781"></a>02781 IF (max_depth IS NOT NULL) THEN
<a name="l02782"></a>02782 curr_stmt = curr_stmt ||
<a name="l02783"></a>02783 &#39; WHERE array_upper(tree_location,1) - 1 &lt;=&#39; ||
<a name="l02784"></a>02784 max_depth;
<a name="l02785"></a>02785 END IF;
<a name="l02786"></a>02786 -- One aggregated display string per tree, returned in tid order.
<a name="l02787"></a>02787 curr_stmt = curr_stmt||&#39; GROUP BY tid ORDER BY tid;&#39;;
<a name="l02788"></a>02788
<a name="l02789"></a>02789 FOR result_rec IN EXECUTE curr_stmt LOOP
<a name="l02790"></a>02790 SELECT MADLIB_SCHEMA.__format(
<a name="l02791"></a>02791 E&#39;\nTree %\n%&#39;,
<a name="l02792"></a>02792 ARRAY[
<a name="l02793"></a>02793 result_rec.tid::TEXT,
<a name="l02794"></a>02794 result_rec.disp_str
<a name="l02795"></a>02795 ]
<a name="l02796"></a>02796 )
<a name="l02797"></a>02797 INTO result;
<a name="l02798"></a>02798 RETURN NEXT result;
<a name="l02799"></a>02799 --RETURN NEXT E&#39;\nTree &#39;||result_rec.tid||E&#39;\n&#39;||result_rec.disp_str;
<a name="l02800"></a>02800 END LOOP;
<a name="l02801"></a>02801 RETURN;
<a name="l02802"></a>02802 END $$ LANGUAGE PLPGSQL;
<a name="l02803"></a>02803
<a name="l02804"></a>02804
<a name="l02805"></a>02805 <span class="comment">/*</span>
<a name="l02806"></a>02806 <span class="comment"> * @brief This is an internal function for displaying the tree in human readable</span>
<a name="l02807"></a>02807 <span class="comment"> * format. It uses the depth-first strategy to traverse a tree and print </span>
<a name="l02808"></a>02808 <span class="comment"> * values. This function is used on databases, e.g. PG 8.4, that do not </span>
<a name="l02809"></a>02809 <span class="comment"> * support ordered aggregate.</span>
<a name="l02810"></a>02810 <span class="comment"> *</span>
<a name="l02811"></a>02811 <span class="comment"> * @param tree_table The full name of the tree table. </span>
<a name="l02812"></a>02812 <span class="comment"> * @param id The ID of current node. This node and all of its </span>
<a name="l02813"></a>02813 <span class="comment"> * children are displayed.</span>
<a name="l02814"></a>02814 <span class="comment"> * @param feature_id The ID of a feature, which is used to split in the </span>
<a name="l02815"></a>02815 <span class="comment"> * parent of current node.</span>
<a name="l02816"></a>02816 <span class="comment"> * @param depth The depth of current node.</span>
<a name="l02817"></a>02817 <span class="comment"> * @param is_cont It specifies whether the feature denoted by &#39;feature_id&#39;</span>
<a name="l02818"></a>02818 <span class="comment"> * is continuous or not.</span>
<a name="l02819"></a>02819 <span class="comment"> * @param split_value For continuous feature, it specifies the split value. </span>
<a name="l02820"></a>02820 <span class="comment"> * Otherwise, it is of no meaning.</span>
<a name="l02821"></a>02821 <span class="comment"> * @param metatable_name For tabular format, this table contains the meta data</span>
<a name="l02822"></a>02822 <span class="comment"> * to encode the input table.</span>
<a name="l02823"></a>02823 <span class="comment"> * @param max_depth The max depth to be displayed. If it is set to null,</span>
<a name="l02824"></a>02824 <span class="comment"> * this function will show all levels. </span>
<a name="l02825"></a>02825 <span class="comment"> * @param tree_id The ID of the tree to be displayed.</span>
<a name="l02826"></a>02826 <span class="comment"> *</span>
<a name="l02827"></a>02827 <span class="comment"> * @return The text representing the tree with human readable format.</span>
<a name="l02828"></a>02828 <span class="comment"> * Note: feature_id, is_cont and split_value describe the split of the PARENT node; the split of this node is read into locals.</span>
<a name="l02829"></a>02829 <span class="comment"> */</span>
<a name="l02830"></a>02830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
<a name="l02831"></a>02831 (
<a name="l02832"></a>02832 tree_table TEXT,
<a name="l02833"></a>02833 id INT,
<a name="l02834"></a>02834 feature_id INT,
<a name="l02835"></a>02835 depth INT,
<a name="l02836"></a>02836 is_cont BOOLEAN,
<a name="l02837"></a>02837 split_value FLOAT,
<a name="l02838"></a>02838 metatable_name TEXT,
<a name="l02839"></a>02839 max_depth INT,
<a name="l02840"></a>02840 tree_id INT
<a name="l02841"></a>02841 )
<a name="l02842"></a>02842 RETURNS TEXT AS $$
<a name="l02843"></a>02843 DECLARE
<a name="l02844"></a>02844 ret TEXT := &#39;&#39;;
<a name="l02845"></a>02845 tree_location INT[];
<a name="l02846"></a>02846 feature INT;
<a name="l02847"></a>02847 max_class INT;
<a name="l02848"></a>02848 num_of_samples INT;
<a name="l02849"></a>02849 temp_is_cont BOOLEAN; -- split type of THIS node; named so that it does not shadow the is_cont parameter
<a name="l02850"></a>02850 temp_split_value FLOAT;
<a name="l02851"></a>02851 index INT;
<a name="l02852"></a>02852 curr_value INT;
<a name="l02853"></a>02853 probability FLOAT;
<a name="l02854"></a>02854 curstmt TEXT;
<a name="l02855"></a>02855 child_nid INT;
<a name="l02856"></a>02856 BEGIN
<a name="l02857"></a>02857 IF (id IS NULL OR id &lt;= 0) THEN
<a name="l02858"></a>02858 RETURN ret;
<a name="l02859"></a>02859 END IF;
<a name="l02860"></a>02860 -- Load the attributes of this node; feature, temp_is_cont and temp_split_value are handed down to its children.
<a name="l02861"></a>02861 SELECT MADLIB_SCHEMA.__format
<a name="l02862"></a>02862 (
<a name="l02863"></a>02863 &#39;SELECT tree_location, feature, is_cont,
<a name="l02864"></a>02864 split_value, max_class,num_of_samples,probability
<a name="l02865"></a>02865 FROM %
<a name="l02866"></a>02866 WHERE id = % AND tid=%&#39;,
<a name="l02867"></a>02867 ARRAY[
<a name="l02868"></a>02868 tree_table,
<a name="l02869"></a>02869 MADLIB_SCHEMA.__to_char(id),
<a name="l02870"></a>02870 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l02871"></a>02871 ]
<a name="l02872"></a>02872 )
<a name="l02873"></a>02873 INTO curstmt;
<a name="l02874"></a>02874
<a name="l02875"></a>02875 EXECUTE curstmt INTO tree_location, feature, temp_is_cont,
<a name="l02876"></a>02876 temp_split_value, max_class, num_of_samples, probability;
<a name="l02877"></a>02877 -- The last element of tree_location is used below as the branch value that leads to this node.
<a name="l02878"></a>02878 curr_value = tree_location[array_upper(tree_location,1)];
<a name="l02879"></a>02879 -- Prefix the line with one padding unit per level (depth + 1 iterations).
<a name="l02880"></a>02880 FOR index IN 0..depth LOOP
<a name="l02881"></a>02881 ret = ret || &#39; &#39;;
<a name="l02882"></a>02882 END LOOP;
<a name="l02883"></a>02883 -- NOTE(review): id greater than tree_id is treated as not-the-root; presumably root ids equal the tree id - confirm.
<a name="l02884"></a>02884 IF (id &gt;tree_id) THEN
<a name="l02885"></a>02885 ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||&#39;: &#39;;
<a name="l02886"></a>02886
<a name="l02887"></a>02887 IF (is_cont) THEN -- the is_cont parameter, i.e. the split type of the parent node
<a name="l02888"></a>02888 IF (curr_value = 1) THEN
<a name="l02889"></a>02889 ret = ret || &#39; &lt;= &#39;;
<a name="l02890"></a>02890 ELSE
<a name="l02891"></a>02891 ret = ret || &#39; &gt; &#39;;
<a name="l02892"></a>02892 END IF;
<a name="l02893"></a>02893 ret = ret || split_value;
<a name="l02894"></a>02894 ELSE
<a name="l02895"></a>02895 ret = ret ||
<a name="l02896"></a>02896 &#39; = &#39; ||
<a name="l02897"></a>02897 MADLIB_SCHEMA.__get_feature_value
<a name="l02898"></a>02898 (
<a name="l02899"></a>02899 feature_id,
<a name="l02900"></a>02900 curr_value,
<a name="l02901"></a>02901 metatable_name
<a name="l02902"></a>02902 );
<a name="l02903"></a>02903 END IF;
<a name="l02904"></a>02904 ELSE
<a name="l02905"></a>02905 ret = ret||&#39;Root Node &#39;;
<a name="l02906"></a>02906 END IF;
<a name="l02907"></a>02907
<a name="l02908"></a>02908 ret = ret ||
<a name="l02909"></a>02909 &#39; : class(&#39; ||
<a name="l02910"></a>02910 MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
<a name="l02911"></a>02911 &#39;) num_elements(&#39; ||
<a name="l02912"></a>02912 num_of_samples ||
<a name="l02913"></a>02913 &#39;) predict_prob(&#39; ||
<a name="l02914"></a>02914 probability ||
<a name="l02915"></a>02915 &#39;)&#39;;
<a name="l02916"></a>02916
<a name="l02917"></a>02917 ret = ret || E&#39;\n&#39;;
<a name="l02918"></a>02918 -- Stop descending once the requested max_depth is reached.
<a name="l02919"></a>02919 IF (max_depth IS NOT NULL AND
<a name="l02920"></a>02920 depth &gt;= max_depth) THEN
<a name="l02921"></a>02921 RETURN ret;
<a name="l02922"></a>02922 END IF;
<a name="l02923"></a>02923 -- Visit the children in id order, handing down the split of this node.
<a name="l02924"></a>02924 curstmt = MADLIB_SCHEMA.__format
<a name="l02925"></a>02925 (
<a name="l02926"></a>02926 &#39;SELECT id
<a name="l02927"></a>02927 FROM %
<a name="l02928"></a>02928 WHERE parent_id = % AND tid=%
<a name="l02929"></a>02929 ORDER BY id&#39;,
<a name="l02930"></a>02930 ARRAY[
<a name="l02931"></a>02931 tree_table,
<a name="l02932"></a>02932 MADLIB_SCHEMA.__to_char(id),
<a name="l02933"></a>02933 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l02934"></a>02934 ]
<a name="l02935"></a>02935 );
<a name="l02936"></a>02936
<a name="l02937"></a>02937 FOR child_nid IN EXECUTE curstmt LOOP
<a name="l02938"></a>02938 ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
<a name="l02939"></a>02939 tree_table,
<a name="l02940"></a>02940 child_nid,
<a name="l02941"></a>02941 feature,
<a name="l02942"></a>02942 depth + 1,
<a name="l02943"></a>02943 temp_is_cont,
<a name="l02944"></a>02944 temp_split_value,
<a name="l02945"></a>02945 metatable_name,
<a name="l02946"></a>02946 max_depth,
<a name="l02947"></a>02947 tree_id);
<a name="l02948"></a>02948 END LOOP;
<a name="l02949"></a>02949
<a name="l02950"></a>02950 RETURN ret;
<a name="l02951"></a>02951 END $$ LANGUAGE PLPGSQL;
<a name="l02952"></a>02952
<a name="l02953"></a>02953
<a name="l02954"></a>02954 <span class="comment">/*</span>
<a name="l02955"></a>02955 <span class="comment"> * @brief Display the trained model in human readable format. It uses a </span>
<a name="l02956"></a>02956 <span class="comment"> * recursive algorithm, which is slower than the version with </span>
<a name="l02957"></a>02957 <span class="comment"> * ordered aggregate. We only use it when ordered aggregate is unavailable.</span>
<a name="l02958"></a>02958 <span class="comment"> *</span>
<a name="l02959"></a>02959 <span class="comment"> * @param tree_table The full name of the tree table. </span>
<a name="l02960"></a>02960 <span class="comment"> * @param tree_id An array containing the IDs of the trees to be displayed.</span>
<a name="l02961"></a>02961 <span class="comment"> * @param max_depth The max depth to be displayed. If it is set to null,</span>
<a name="l02962"></a>02962 <span class="comment"> * this function will show all levels. </span>
<a name="l02963"></a>02963 <span class="comment"> *</span>
<a name="l02964"></a>02964 <span class="comment"> * @return One text row per requested tree, each in human readable format.</span>
<a name="l02965"></a>02965 <span class="comment"> *</span>
<a name="l02966"></a>02966 <span class="comment"> */</span>
<a name="l02967"></a>02967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
<a name="l02968"></a>02968 (
<a name="l02969"></a>02969 tree_table TEXT,
<a name="l02970"></a>02970 tree_id INT[],
<a name="l02971"></a>02971 max_depth INT
<a name="l02972"></a>02972 )
<a name="l02973"></a>02973 RETURNS SETOF TEXT AS $$
<a name="l02974"></a>02974 DECLARE
<a name="l02975"></a>02975 metatable_name TEXT := null;
<a name="l02976"></a>02976 curstmt TEXT := &#39;&#39;; -- declared but never referenced in the body below
<a name="l02977"></a>02977 index INT;
<a name="l02978"></a>02978 result TEXT := &#39;&#39;; -- declared but never referenced in the body below
<a name="l02979"></a>02979 root_id INT;
<a name="l02980"></a>02980 BEGIN
<a name="l02981"></a>02981 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l02982"></a>02982 (
<a name="l02983"></a>02983 tree_table,
<a name="l02984"></a>02984 &#39;t&#39;
<a name="l02985"></a>02985 );
<a name="l02986"></a>02986 -- Resolve the name of the metadata table that belongs to this tree table.
<a name="l02987"></a>02987 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
<a name="l02988"></a>02988
<a name="l02989"></a>02989 index= array_lower(tree_id,1);
<a name="l02990"></a>02990 -- Walk the tree_id array from its lower to its upper bound, one tree per iteration.
<a name="l02991"></a>02991 WHILE (index&lt;=array_upper(tree_id,1) ) LOOP
<a name="l02992"></a>02992 EXECUTE &#39;SELECT id FROM &#39;||tree_table||
<a name="l02993"></a>02993 &#39; WHERE parent_id=0 and tid=&#39;||tree_id[index]||&#39;;&#39; INTO root_id;
<a name="l02994"></a>02994 -- Emit a Tree N header followed by the recursive rendering started at the root (the node with parent_id 0) at depth 0.
<a name="l02995"></a>02995 RETURN NEXT E&#39;\nTree &#39;||tree_id[index]||E&#39;\n&#39;||
<a name="l02996"></a>02996 MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, &#39;f&#39;,
<a name="l02997"></a>02997 0, metatable_name,max_depth,tree_id[index]);
<a name="l02998"></a>02998 index=index+1;
<a name="l02999"></a>02999 END LOOP;
<a name="l03000"></a>03000 RETURN;
<a name="l03001"></a>03001 END $$ LANGUAGE PLPGSQL;
<a name="l03002"></a>03002
<a name="l03003"></a>03003
<a name="l03004"></a>03004 <span class="comment">/*</span>
<a name="l03005"></a>03005 <span class="comment"> * @brief Multiple trees may classify the same record to different classes. </span>
<a name="l03006"></a>03006 <span class="comment"> * This function gets the results voted by multiple trees.</span>
<a name="l03007"></a>03007 <span class="comment"> *</span>
<a name="l03008"></a>03008 <span class="comment"> * @param src_table The full name of the table containing original data.</span>
<a name="l03009"></a>03009 <span class="comment"> * @param dst_table The full name of the table to store the voted results. </span>
<a name="l03010"></a>03010 <span class="comment"> * It is dropped if it exists and recreated as a TEMP table (id, class, prob).</span>
<a name="l03011"></a>03011 <span class="comment"> */</span>
<a name="l03012"></a>03012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
<a name="l03013"></a>03013 (
<a name="l03014"></a>03014 src_table TEXT,
<a name="l03015"></a>03015 dst_table TEXT
<a name="l03016"></a>03016 )
<a name="l03017"></a>03017 RETURNS VOID AS $$
<a name="l03018"></a>03018 DECLARE
<a name="l03019"></a>03019 curstmt TEXT;
<a name="l03020"></a>03020 BEGIN
<a name="l03021"></a>03021 EXECUTE &#39;DROP TABLE IF EXISTS &#39;||dst_table;
<a name="l03022"></a>03022 EXECUTE &#39;CREATE TEMP TABLE &#39;||dst_table||E&#39;
<a name="l03023"></a>03023 (
<a name="l03024"></a>03024 id BIGINT,
<a name="l03025"></a>03025 class INT,
<a name="l03026"></a>03026 prob FLOAT8
<a name="l03027"></a>03027 )m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
<a name="l03028"></a>03028 -- Per id and class: count the votes and average prob. Arrays compare element by element, so max over array[count,prob,class] keeps the class with the most votes (ties: higher prob, then higher class).
<a name="l03029"></a>03029 SELECT MADLIB_SCHEMA.__format(
<a name="l03030"></a>03030 &#39;INSERT INTO %
<a name="l03031"></a>03031 SELECT id, max_array[3], max_array[2] FROM
<a name="l03032"></a>03032 (SELECT id, max(array[count,prob,class]) AS max_array FROM
<a name="l03033"></a>03033 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
<a name="l03034"></a>03034 % GROUP BY id,class) t1 GROUP BY id) t2&#39;,
<a name="l03035"></a>03035 ARRAY[
<a name="l03036"></a>03036 dst_table,
<a name="l03037"></a>03037 src_table
<a name="l03038"></a>03038 ]
<a name="l03039"></a>03039 )
<a name="l03040"></a>03040 INTO curstmt;
<a name="l03041"></a>03041 EXECUTE curstmt;
<a name="l03042"></a>03042 RETURN;
<a name="l03043"></a>03043 END
<a name="l03044"></a>03044 $$ LANGUAGE PLPGSQL;
<a name="l03045"></a>03045
<a name="l03046"></a>03046
<a name="l03047"></a>03047 <span class="comment">/*</span>
<a name="l03048"></a>03048 <span class="comment"> * @brief An internal classification function. It classifies with all trees at </span>
<a name="l03049"></a>03049 <span class="comment"> * the same time. For medium/small data sets, tests show that it is more</span>
<a name="l03050"></a>03050 <span class="comment"> * efficient than the serial classification function. </span>
<a name="l03051"></a>03051 <span class="comment"> *</span>
<a name="l03052"></a>03052 <span class="comment"> * @param classification_table_name The full name of the table containing the </span>
<a name="l03053"></a>03053 <span class="comment"> * classification set.</span>
<a name="l03054"></a>03054 <span class="comment"> * @param tree_table_name The full name of the tree table.</span>
<a name="l03055"></a>03055 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l03056"></a>03056 <span class="comment"> *</span>
<a name="l03057"></a>03057 <span class="comment"> * @return An array containing the encoded table name and classification result </span>
<a name="l03058"></a>03058 <span class="comment"> * table name (We encode the source table during the classification).</span>
<a name="l03059"></a>03059 <span class="comment"> *</span>
<a name="l03060"></a>03060 <span class="comment"> */</span>
<a name="l03061"></a>03061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
<a name="l03062"></a>03062 (
<a name="l03063"></a>03063 classification_table_name TEXT,
<a name="l03064"></a>03064 tree_table_name TEXT,
<a name="l03065"></a>03065 verbosity INT
<a name="l03066"></a>03066 )
<a name="l03067"></a>03067 RETURNS TEXT[] AS $$
<a name="l03068"></a>03068 DECLARE
<a name="l03069"></a>03069 table_pick INT := 1;
<a name="l03070"></a>03070 remains_to_classify INT;
<a name="l03071"></a>03071 size_finished INT;
<a name="l03072"></a>03072 time_stamp TIMESTAMP;
<a name="l03073"></a>03073 metatable_name TEXT := &#39;&#39;;
<a name="l03074"></a>03074 id_col_name TEXT := &#39;id&#39;;
<a name="l03075"></a>03075 curr_level INT := 1;
<a name="l03076"></a>03076 max_level INT := 0;
<a name="l03077"></a>03077 h2hmv_routine_id INT := 0;
<a name="l03078"></a>03078 curstmt TEXT := &#39;&#39;;
<a name="l03079"></a>03079 result_table_name TEXT := &#39;dt_classify_internal_rt&#39;;
<a name="l03080"></a>03080 encoded_table_name TEXT := &#39;dt_classify_internal_edt&#39;;
<a name="l03081"></a>03081 table_names TEXT[] := &#39;{classified_instance_ping,classified_instance_pong}&#39;;
<a name="l03082"></a>03082 tree_id INT;
<a name="l03083"></a>03083 BEGIN
<a name="l03084"></a>03084 time_stamp = clock_timestamp();
<a name="l03085"></a>03085
<a name="l03086"></a>03086 PERFORM MADLIB_SCHEMA.__assert
<a name="l03087"></a>03087 (
<a name="l03088"></a>03088 (classification_table_name IS NOT NULL) AND
<a name="l03089"></a>03089 (
<a name="l03090"></a>03090 MADLIB_SCHEMA.__table_exists
<a name="l03091"></a>03091 (
<a name="l03092"></a>03092 classification_table_name
<a name="l03093"></a>03093 )
<a name="l03094"></a>03094 ),
<a name="l03095"></a>03095 &#39;the specified classification table&#39; ||
<a name="l03096"></a>03096 coalesce(&#39;&lt;&#39; || classification_table_name ||
<a name="l03097"></a>03097 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
<a name="l03098"></a>03098 );
<a name="l03099"></a>03099
<a name="l03100"></a>03100 PERFORM MADLIB_SCHEMA.__assert
<a name="l03101"></a>03101 (
<a name="l03102"></a>03102 (tree_table_name IS NOT NULL) AND
<a name="l03103"></a>03103 (
<a name="l03104"></a>03104 MADLIB_SCHEMA.__table_exists
<a name="l03105"></a>03105 (
<a name="l03106"></a>03106 tree_table_name
<a name="l03107"></a>03107 )
<a name="l03108"></a>03108 ),
<a name="l03109"></a>03109 &#39;the specified tree table&#39; ||
<a name="l03110"></a>03110 coalesce(&#39;&lt;&#39; || tree_table_name || &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
<a name="l03111"></a>03111 );
<a name="l03112"></a>03112
<a name="l03113"></a>03113 PERFORM MADLIB_SCHEMA.__assert
<a name="l03114"></a>03114 (
<a name="l03115"></a>03115 verbosity IS NOT NULL,
<a name="l03116"></a>03116 &#39;verbosity must be non-null&#39;
<a name="l03117"></a>03117 );
<a name="l03118"></a>03118
<a name="l03119"></a>03119 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE&#39;;
<a name="l03120"></a>03120
<a name="l03121"></a>03121 SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
<a name="l03122"></a>03122
<a name="l03123"></a>03123 SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
<a name="l03124"></a>03124
<a name="l03125"></a>03125 PERFORM MADLIB_SCHEMA.__encode_table
<a name="l03126"></a>03126 (
<a name="l03127"></a>03127 classification_table_name,
<a name="l03128"></a>03128 encoded_table_name,
<a name="l03129"></a>03129 metatable_name,
<a name="l03130"></a>03130 h2hmv_routine_id,
<a name="l03131"></a>03131 verbosity
<a name="l03132"></a>03132 );
<a name="l03133"></a>03133
<a name="l03134"></a>03134 IF (verbosity &gt; 0) THEN
<a name="l03135"></a>03135 RAISE INFO &#39;tabular format. id_col_name: %&#39;, id_col_name;
<a name="l03136"></a>03136 END IF;
<a name="l03137"></a>03137
<a name="l03138"></a>03138 <span class="comment">/*</span>
<a name="l03139"></a>03139 <span class="comment"> * The table of classified_instance_ping and classified_instance_pong are</span>
<a name="l03140"></a>03140 <span class="comment"> * auxiliary tables used during the classification process.</span>
<a name="l03141"></a>03141 <span class="comment"> * For each record, these tables tell us which node it belongs to. They also</span>
<a name="l03142"></a>03142 <span class="comment"> * hold the information of class and probability.</span>
<a name="l03143"></a>03143 <span class="comment"> * We use transfer data between these two tables rather than update a single</span>
<a name="l03144"></a>03144 <span class="comment"> * table during the classification process. We find the operation of update</span>
<a name="l03145"></a>03145 <span class="comment"> * is quite expensive.</span>
<a name="l03146"></a>03146 <span class="comment"> */</span>
<a name="l03147"></a>03147 DROP TABLE IF EXISTS classified_instance_ping;
<a name="l03148"></a>03148 CREATE TEMP TABLE classified_instance_ping
<a name="l03149"></a>03149 (
<a name="l03150"></a>03150 tid INT,
<a name="l03151"></a>03151 id BIGINT,
<a name="l03152"></a>03152 jump INT,
<a name="l03153"></a>03153 class INT,
<a name="l03154"></a>03154 prob FLOAT,
<a name="l03155"></a>03155 parent_id INT,
<a name="l03156"></a>03156 leaf_id INT
<a name="l03157"></a>03157 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
<a name="l03158"></a>03158
<a name="l03159"></a>03159 DROP TABLE IF EXISTS classified_instance_pong;
<a name="l03160"></a>03160 CREATE TEMP TABLE classified_instance_pong
<a name="l03161"></a>03161 (
<a name="l03162"></a>03162 tid INT,
<a name="l03163"></a>03163 id BIGINT,
<a name="l03164"></a>03164 jump INT,
<a name="l03165"></a>03165 class INT,
<a name="l03166"></a>03166 prob FLOAT,
<a name="l03167"></a>03167 parent_id INT,
<a name="l03168"></a>03168 leaf_id INT
<a name="l03169"></a>03169 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
<a name="l03170"></a>03170
<a name="l03171"></a>03171
<a name="l03172"></a>03172 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name || &#39; CASCADE&#39;;
<a name="l03173"></a>03173 EXECUTE &#39;CREATE TEMP TABLE &#39; || result_table_name || E&#39;
<a name="l03174"></a>03174 (
<a name="l03175"></a>03175 tid INT,
<a name="l03176"></a>03176 id BIGINT,
<a name="l03177"></a>03177 jump INT,
<a name="l03178"></a>03178 class INT,
<a name="l03179"></a>03179 prob FLOAT,
<a name="l03180"></a>03180 parent_id INT,
<a name="l03181"></a>03181 leaf_id INT
<a name="l03182"></a>03182 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
<a name="l03183"></a>03183
<a name="l03184"></a>03184
<a name="l03185"></a>03185 EXECUTE &#39;INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
<a name="l03186"></a>03186 SELECT m.&#39;||id_col_name||&#39;, t.id, 0, 0, t.tid
<a name="l03187"></a>03187 FROM &#39; || encoded_table_name || &#39; m CROSS JOIN
<a name="l03188"></a>03188 (SELECT DISTINCT tid,id FROM &#39;||tree_table_name||&#39; WHERE parent_id=0) t;&#39;;
<a name="l03189"></a>03189
<a name="l03190"></a>03190
<a name="l03191"></a>03191 EXECUTE &#39;SELECT max(array_upper(tree_location,1)) FROM &#39;||tree_table_name||&#39;;&#39;
<a name="l03192"></a>03192 INTO max_level;
<a name="l03193"></a>03193
<a name="l03194"></a>03194 IF( max_level is NULL ) THEN
<a name="l03195"></a>03195 RAISE EXCEPTION &#39;tree should not be empty&#39;;
<a name="l03196"></a>03196 END IF;
<a name="l03197"></a>03197
<a name="l03198"></a>03198 FOR curr_level IN 1..max_level LOOP
<a name="l03199"></a>03199 IF (verbosity &gt; 0) THEN
<a name="l03200"></a>03200 RAISE INFO &#39;new_depth: %&#39;, curr_level;
<a name="l03201"></a>03201 END IF;
<a name="l03202"></a>03202
<a name="l03203"></a>03203 table_pick = table_pick % 2 + 1;
<a name="l03204"></a>03204
<a name="l03205"></a>03205 EXECUTE &#39;TRUNCATE &#39;|| table_names[table_pick] ||&#39;;&#39;;
<a name="l03206"></a>03206 EXECUTE &#39;SELECT count(id) FROM &#39;||result_table_name||&#39;;&#39; INTO size_finished;
<a name="l03207"></a>03207
<a name="l03208"></a>03208 IF (verbosity &gt; 0) THEN
<a name="l03209"></a>03209 RAISE INFO &#39;size_finished %&#39;, size_finished;
<a name="l03210"></a>03210 END IF;
<a name="l03211"></a>03211
<a name="l03212"></a>03212 EXECUTE &#39;SELECT count(*) FROM &#39;|| table_names[(table_pick) % 2 + 1] ||&#39;;&#39;
<a name="l03213"></a>03213 INTO remains_to_classify;
<a name="l03214"></a>03214
<a name="l03215"></a>03215 IF (remains_to_classify = 0) THEN
<a name="l03216"></a>03216 IF (verbosity &gt; 0) THEN
<a name="l03217"></a>03217 RAISE INFO &#39;size_finished: % remains_to_classify: %&#39;,
<a name="l03218"></a>03218 size_finished, remains_to_classify;
<a name="l03219"></a>03219 END IF;
<a name="l03220"></a>03220
<a name="l03221"></a>03221 EXIT;
<a name="l03222"></a>03222 END IF;
<a name="l03223"></a>03223
<a name="l03224"></a>03224 SELECT MADLIB_SCHEMA.__format(
<a name="l03225"></a>03225 &#39;INSERT INTO %
<a name="l03226"></a>03226 SELECT pt.tid, pt.id,
<a name="l03227"></a>03227 CASE WHEN (is_cont) THEN
<a name="l03228"></a>03228 CASE WHEN (gt.lmc_nid IS NULL) THEN
<a name="l03229"></a>03229 0
<a name="l03230"></a>03230 ELSE
<a name="l03231"></a>03231 gt.lmc_nid +
<a name="l03232"></a>03232 float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
<a name="l03233"></a>03233 gt.lmc_fval
<a name="l03234"></a>03234 END
<a name="l03235"></a>03235 ELSE
<a name="l03236"></a>03236 CASE WHEN (gt.lmc_nid IS NULL) THEN
<a name="l03237"></a>03237 0
<a name="l03238"></a>03238 ELSE
<a name="l03239"></a>03239 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
<a name="l03240"></a>03240 END
<a name="l03241"></a>03241 END as newjump,
<a name="l03242"></a>03242 gt.max_class, gt.probability, gt.parent_id, gt.id
<a name="l03243"></a>03243 FROM
<a name="l03244"></a>03244 (SELECT t1.tid, t1.id, t1.jump, fvals
<a name="l03245"></a>03245 FROM % t1
<a name="l03246"></a>03246 LEFT JOIN % t2
<a name="l03247"></a>03247 ON t1.id = t2.id) AS pt,
<a name="l03248"></a>03248 (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
<a name="l03249"></a>03249 parent_id, id, is_cont, split_value
<a name="l03250"></a>03250 FROM %
<a name="l03251"></a>03251 WHERE array_upper(tree_location,1) = %) AS gt
<a name="l03252"></a>03252 WHERE pt.jump = gt.id AND pt.tid=gt.tid;&#39;,
<a name="l03253"></a>03253 ARRAY[
<a name="l03254"></a>03254 table_names[table_pick],
<a name="l03255"></a>03255 table_names[(table_pick) % 2 + 1],
<a name="l03256"></a>03256 encoded_table_name,
<a name="l03257"></a>03257 tree_table_name,
<a name="l03258"></a>03258 MADLIB_SCHEMA.__to_char(curr_level)
<a name="l03259"></a>03259 ]
<a name="l03260"></a>03260 )
<a name="l03261"></a>03261 INTO curstmt;
<a name="l03262"></a>03262 EXECUTE curstmt;
<a name="l03263"></a>03263 <span class="comment">/*</span>
<a name="l03264"></a>03264 <span class="comment"> * if the node (whose id is &quot;jump&quot;) doesn&#39;t exist, </span>
<a name="l03265"></a>03265 <span class="comment"> * then insert them into result table </span>
<a name="l03266"></a>03266 <span class="comment"> * (be classified to max_class of its corrsponding node)</span>
<a name="l03267"></a>03267 <span class="comment"> */</span>
<a name="l03268"></a>03268 FOR tree_id IN EXECUTE &#39;SELECT DISTINCT tid FROM &#39;||tree_table_name LOOP
<a name="l03269"></a>03269 SELECT MADLIB_SCHEMA.__format(
<a name="l03270"></a>03270 &#39;INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
<a name="l03271"></a>03271 SELECT tid,id, 0, class, prob, parent_id, leaf_id
<a name="l03272"></a>03272 FROM %
<a name="l03273"></a>03273 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
<a name="l03274"></a>03274 AND tid=%&#39;,
<a name="l03275"></a>03275 ARRAY[
<a name="l03276"></a>03276 result_table_name,
<a name="l03277"></a>03277 table_names[table_pick],
<a name="l03278"></a>03278 tree_table_name,
<a name="l03279"></a>03279 MADLIB_SCHEMA.__to_char(tree_id),
<a name="l03280"></a>03280 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l03281"></a>03281 ]
<a name="l03282"></a>03282 )
<a name="l03283"></a>03283 INTO curstmt;
<a name="l03284"></a>03284 EXECUTE curstmt;
<a name="l03285"></a>03285
<a name="l03286"></a>03286 -- delete from the being classified data table
<a name="l03287"></a>03287 SELECT MADLIB_SCHEMA.__format(
<a name="l03288"></a>03288 &#39;DELETE FROM %
<a name="l03289"></a>03289 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
<a name="l03290"></a>03290 AND tid=%&#39;,
<a name="l03291"></a>03291 ARRAY[
<a name="l03292"></a>03292 table_names[table_pick],
<a name="l03293"></a>03293 tree_table_name,
<a name="l03294"></a>03294 MADLIB_SCHEMA.__to_char(tree_id),
<a name="l03295"></a>03295 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l03296"></a>03296 ]
<a name="l03297"></a>03297 )
<a name="l03298"></a>03298 INTO curstmt;
<a name="l03299"></a>03299 EXECUTE curstmt;
<a name="l03300"></a>03300 END LOOP;
<a name="l03301"></a>03301 END LOOP;
<a name="l03302"></a>03302
<a name="l03303"></a>03303 EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT * FROM &#39;||
<a name="l03304"></a>03304 table_names[table_pick] ||&#39; WHERE jump = 0;&#39;;
<a name="l03305"></a>03305 EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT * FROM &#39;||
<a name="l03306"></a>03306 table_names[table_pick % 2 + 1] ||&#39; WHERE jump = 0;&#39;;
<a name="l03307"></a>03307
<a name="l03308"></a>03308 IF (verbosity &gt; 0) THEN
<a name="l03309"></a>03309 RAISE INFO &#39;final classification time:%&#39;, clock_timestamp() - time_stamp;
<a name="l03310"></a>03310 END IF;
<a name="l03311"></a>03311
<a name="l03312"></a>03312 RETURN ARRAY[encoded_table_name, result_table_name];
<a name="l03313"></a>03313 END
<a name="l03314"></a>03314 $$ LANGUAGE PLPGSQL;
<a name="l03315"></a>03315
<a name="l03316"></a>03316
<a name="l03317"></a>03317 <span class="comment">/*</span>
<a name="l03318"></a>03318 <span class="comment"> * @brief An internal classification function. It classifies with one tree </span>
<a name="l03319"></a>03319 <span class="comment"> * after another. For large data sets, tests show that it is more</span>
<a name="l03320"></a>03320 <span class="comment"> * efficient than the parallel classification function. </span>
<a name="l03321"></a>03321 <span class="comment"> *</span>
<a name="l03322"></a>03322 <span class="comment"> * @param classification_table_name The full name of the table containing the </span>
<a name="l03323"></a>03323 <span class="comment"> * classification set.</span>
<a name="l03324"></a>03324 <span class="comment"> * @param tree_table_name The full name of the tree table.</span>
<a name="l03325"></a>03325 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l03326"></a>03326 <span class="comment"> *</span>
<a name="l03327"></a>03327 <span class="comment"> * @return An array containing the encoded table name and classification result </span>
<a name="l03328"></a>03328 <span class="comment"> * table name (We encode the source table during the classification).</span>
<a name="l03329"></a>03329 <span class="comment"> * @note The work tables have fixed names; existing tables with these names are dropped.</span>
<a name="l03330"></a>03330 <span class="comment"> */</span>
<a name="l03331"></a>03331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
<a name="l03332"></a>03332 (
<a name="l03333"></a>03333 classification_table_name TEXT,
<a name="l03334"></a>03334 tree_table_name TEXT,
<a name="l03335"></a>03335 verbosity INT
<a name="l03336"></a>03336 )
<a name="l03337"></a>03337 RETURNS TEXT[] AS $$
<a name="l03338"></a>03338 DECLARE
<a name="l03339"></a>03339 table_pick INT := 1;
<a name="l03340"></a>03340 remains_to_classify INT;
<a name="l03341"></a>03341 size_finished INT;
<a name="l03342"></a>03342 time_stamp TIMESTAMP;
<a name="l03343"></a>03343 metatable_name TEXT := &#39;&#39;;
<a name="l03344"></a>03344 id_col_name TEXT := &#39;id&#39;;
<a name="l03345"></a>03345 curr_level INT := 1;
<a name="l03346"></a>03346 max_level INT := 0;
<a name="l03347"></a>03347 h2hmv_routine_id INT := 0;
<a name="l03348"></a>03348 curstmt TEXT := &#39;&#39;;
<a name="l03349"></a>03349 result_table_name TEXT := &#39;dt_classify_internal_rt&#39;;
<a name="l03350"></a>03350 encoded_table_name TEXT := &#39;dt_classify_internal_edt&#39;;
<a name="l03351"></a>03351 table_names TEXT[] := ARRAY[
<a name="l03352"></a>03352 &#39;classified_instance_ping&#39;,
<a name="l03353"></a>03353 &#39;classified_instance_pong&#39;
<a name="l03354"></a>03354 ];
<a name="l03355"></a>03355 tree_id INT;
<a name="l03356"></a>03356 root_id INT;
<a name="l03357"></a>03357 BEGIN
<a name="l03358"></a>03358 time_stamp = clock_timestamp();
<a name="l03359"></a>03359
<a name="l03360"></a>03360 PERFORM MADLIB_SCHEMA.__assert
<a name="l03361"></a>03361 (
<a name="l03362"></a>03362 (classification_table_name IS NOT NULL) AND
<a name="l03363"></a>03363 (
<a name="l03364"></a>03364 MADLIB_SCHEMA.__table_exists
<a name="l03365"></a>03365 (
<a name="l03366"></a>03366 classification_table_name
<a name="l03367"></a>03367 )
<a name="l03368"></a>03368 ),
<a name="l03369"></a>03369 &#39;the specified classification table&#39; ||
<a name="l03370"></a>03370 coalesce(&#39;&lt;&#39; ||
<a name="l03371"></a>03371 classification_table_name ||
<a name="l03372"></a>03372 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
<a name="l03373"></a>03373 );
<a name="l03374"></a>03374
<a name="l03375"></a>03375 PERFORM MADLIB_SCHEMA.__assert
<a name="l03376"></a>03376 (
<a name="l03377"></a>03377 (tree_table_name IS NOT NULL) AND
<a name="l03378"></a>03378 (
<a name="l03379"></a>03379 MADLIB_SCHEMA.__table_exists
<a name="l03380"></a>03380 (
<a name="l03381"></a>03381 tree_table_name
<a name="l03382"></a>03382 )
<a name="l03383"></a>03383 ),
<a name="l03384"></a>03384 &#39;the specified tree table&#39; ||
<a name="l03385"></a>03385 coalesce(&#39;&lt;&#39; ||
<a name="l03386"></a>03386 tree_table_name ||
<a name="l03387"></a>03387 &#39;&gt; does not exists&#39;, &#39; is NULL&#39;)
<a name="l03388"></a>03388 );
<a name="l03389"></a>03389
<a name="l03390"></a>03390
<a name="l03391"></a>03391 PERFORM MADLIB_SCHEMA.__assert
<a name="l03392"></a>03392 (
<a name="l03393"></a>03393 verbosity IS NOT NULL,
<a name="l03394"></a>03394 &#39;verbosity must be non-null&#39;
<a name="l03395"></a>03395 );
<a name="l03396"></a>03396
<a name="l03397"></a>03397 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39; CASCADE&#39;;
<a name="l03398"></a>03398
<a name="l03399"></a>03399 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
<a name="l03400"></a>03400
<a name="l03401"></a>03401 h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
<a name="l03402"></a>03402
<a name="l03403"></a>03403 PERFORM MADLIB_SCHEMA.__encode_table
<a name="l03404"></a>03404 (
<a name="l03405"></a>03405 classification_table_name,
<a name="l03406"></a>03406 encoded_table_name,
<a name="l03407"></a>03407 metatable_name,
<a name="l03408"></a>03408 h2hmv_routine_id,
<a name="l03409"></a>03409 verbosity
<a name="l03410"></a>03410 );
<a name="l03411"></a>03411
<a name="l03412"></a>03412 IF (verbosity &gt; 0) THEN
<a name="l03413"></a>03413 RAISE INFO &#39;tabular format. id_col_name: %&#39;, id_col_name;
<a name="l03414"></a>03414 END IF;
<a name="l03415"></a>03415
<a name="l03416"></a>03416 <span class="comment">/*</span>
<a name="l03417"></a>03417 <span class="comment"> * The tables classified_instance_ping and classified_instance_pong are</span>
<a name="l03418"></a>03418 <span class="comment"> * auxiliary tables used during the classification process.</span>
<a name="l03419"></a>03419 <span class="comment"> * For each record, these tables tell us which node it belongs to. They also</span>
<a name="l03420"></a>03420 <span class="comment"> * hold the information of class and probability.</span>
<a name="l03421"></a>03421 <span class="comment"> * We transfer data between these two tables rather than updating a single</span>
<a name="l03422"></a>03422 <span class="comment"> * table during the classification process, because we found that updates</span>
<a name="l03423"></a>03423 <span class="comment"> * are quite expensive.</span>
<a name="l03424"></a>03424 <span class="comment"> */</span>
<a name="l03425"></a>03425 DROP TABLE IF EXISTS classified_instance_ping;
<a name="l03426"></a>03426 CREATE TEMP TABLE classified_instance_ping
<a name="l03427"></a>03427 (
<a name="l03428"></a>03428 id BIGINT,
<a name="l03429"></a>03429 jump INT,
<a name="l03430"></a>03430 class INT,
<a name="l03431"></a>03431 prob FLOAT,
<a name="l03432"></a>03432 parent_id INT,
<a name="l03433"></a>03433 leaf_id INT
<a name="l03434"></a>03434 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
<a name="l03435"></a>03435
<a name="l03436"></a>03436 DROP TABLE IF EXISTS classified_instance_pong;
<a name="l03437"></a>03437 CREATE TEMP TABLE classified_instance_pong
<a name="l03438"></a>03438 (
<a name="l03439"></a>03439 id BIGINT,
<a name="l03440"></a>03440 jump INT,
<a name="l03441"></a>03441 class INT,
<a name="l03442"></a>03442 prob FLOAT,
<a name="l03443"></a>03443 parent_id INT,
<a name="l03444"></a>03444 leaf_id INT
<a name="l03445"></a>03445 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);
<a name="l03446"></a>03446
<a name="l03447"></a>03447
<a name="l03448"></a>03448 EXECUTE &#39;DROP TABLE IF EXISTS &#39;||result_table_name || &#39; CASCADE&#39;;
<a name="l03449"></a>03449 EXECUTE &#39;CREATE TEMP TABLE &#39; || result_table_name || E&#39;
<a name="l03450"></a>03450 (
<a name="l03451"></a>03451 tid INT,
<a name="l03452"></a>03452 id BIGINT,
<a name="l03453"></a>03453 jump INT,
<a name="l03454"></a>03454 class INT,
<a name="l03455"></a>03455 prob FLOAT,
<a name="l03456"></a>03456 parent_id INT,
<a name="l03457"></a>03457 leaf_id INT
<a name="l03458"></a>03458 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (id)&#39;);&#39;;
<a name="l03459"></a>03459
<a name="l03460"></a>03460 FOR tree_id IN EXECUTE &#39;SELECT DISTINCT tid FROM &#39;||tree_table_name LOOP
<a name="l03461"></a>03461 EXECUTE &#39;SELECT max(array_upper(tree_location,1)) FROM &#39;||
<a name="l03462"></a>03462 tree_table_name||&#39; WHERE tid=&#39;||tree_id||&#39;;&#39; INTO max_level;
<a name="l03463"></a>03463 IF (verbosity &gt; 0) THEN
<a name="l03464"></a>03464 RAISE INFO &#39;tree_id: %, max_level: %&#39;, tree_id,max_level;
<a name="l03465"></a>03465 END IF;
<a name="l03466"></a>03466
<a name="l03467"></a>03467
<a name="l03468"></a>03468 IF( max_level is NULL ) THEN
<a name="l03469"></a>03469 RAISE EXCEPTION &#39;tree should not be empty&#39;;
<a name="l03470"></a>03470 END IF;
<a name="l03471"></a>03471 -- start this tree with empty work tables and every record at the root node
<a name="l03472"></a>03472 TRUNCATE classified_instance_ping;
<a name="l03473"></a>03473 TRUNCATE classified_instance_pong;
<a name="l03474"></a>03474
<a name="l03475"></a>03475 EXECUTE &#39;SELECT id FROM &#39;||tree_table_name||
<a name="l03476"></a>03476 &#39; WHERE parent_id=0 and tid=&#39;||tree_id||&#39;;&#39; INTO root_id;
<a name="l03477"></a>03477 EXECUTE &#39;INSERT INTO classified_instance_ping (id, jump, class, prob)
<a name="l03478"></a>03478 SELECT &#39;||id_col_name||&#39;, &#39;||root_id||&#39;, 0, 0 FROM &#39; ||
<a name="l03479"></a>03479 encoded_table_name || &#39;;&#39;;
<a name="l03480"></a>03480 table_pick= 1;
<a name="l03481"></a>03481 FOR curr_level IN 1..max_level LOOP
<a name="l03482"></a>03482 IF (verbosity &gt; 0) THEN
<a name="l03483"></a>03483 RAISE INFO &#39;new_depth: %&#39;, curr_level;
<a name="l03484"></a>03484 END IF;
<a name="l03485"></a>03485 -- swap roles: the table picked here is filled from the other table
<a name="l03486"></a>03486 table_pick = table_pick % 2 + 1;
<a name="l03487"></a>03487
<a name="l03488"></a>03488 EXECUTE &#39;TRUNCATE &#39;|| table_names[table_pick] ||&#39;;&#39;;
<a name="l03489"></a>03489 -- size_finished is only reported in verbose mode, so only count then
<a name="l03490"></a>03490 IF (verbosity &gt; 0) THEN
<a name="l03491"></a>03491 EXECUTE &#39;SELECT count(id) FROM &#39;||result_table_name||&#39;;&#39;
<a name="l03492"></a>03492 INTO size_finished;
<a name="l03493"></a>03493 RAISE INFO &#39;size_finished %&#39;, size_finished;
<a name="l03494"></a>03494 END IF;
<a name="l03495"></a>03495
<a name="l03496"></a>03496 EXECUTE &#39;SELECT count(*) FROM &#39;||
<a name="l03497"></a>03497 table_names[(table_pick) % 2 + 1] ||&#39;;&#39;
<a name="l03498"></a>03498 INTO remains_to_classify;
<a name="l03499"></a>03499
<a name="l03500"></a>03500 IF (remains_to_classify = 0) THEN
<a name="l03501"></a>03501 IF (verbosity &gt; 0) THEN
<a name="l03502"></a>03502 RAISE INFO &#39;size_finished: % remains_to_classify: %&#39;,
<a name="l03503"></a>03503 size_finished, remains_to_classify;
<a name="l03504"></a>03504 END IF;
<a name="l03505"></a>03505
<a name="l03506"></a>03506 EXIT;
<a name="l03507"></a>03507 END IF;
<a name="l03508"></a>03508
<a name="l03509"></a>03509 SELECT MADLIB_SCHEMA.__format(
<a name="l03510"></a>03510 &#39;INSERT INTO %
<a name="l03511"></a>03511 SELECT pt.id,
<a name="l03512"></a>03512 CASE WHEN (is_cont) THEN
<a name="l03513"></a>03513 CASE WHEN (gt.lmc_nid IS NULL) THEN
<a name="l03514"></a>03514 0
<a name="l03515"></a>03515 ELSE
<a name="l03516"></a>03516 gt.lmc_nid +
<a name="l03517"></a>03517 float8lt(gt.split_value, fvals[gt.feature])::INT4
<a name="l03518"></a>03518 + 1 - gt.lmc_fval
<a name="l03519"></a>03519 END
<a name="l03520"></a>03520 ELSE
<a name="l03521"></a>03521 CASE WHEN (gt.lmc_nid IS NULL) THEN
<a name="l03522"></a>03522 0
<a name="l03523"></a>03523 ELSE
<a name="l03524"></a>03524 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
<a name="l03525"></a>03525 END
<a name="l03526"></a>03526 END as newjump,
<a name="l03527"></a>03527 gt.max_class, gt.probability, gt.parent_id, gt.id
<a name="l03528"></a>03528 FROM
<a name="l03529"></a>03529 (
<a name="l03530"></a>03530 SELECT t1.id, t1.jump, fvals
<a name="l03531"></a>03531 FROM % t1
<a name="l03532"></a>03532 LEFT JOIN % t2
<a name="l03533"></a>03533 ON t1.id = t2.id
<a name="l03534"></a>03534 ) AS pt,
<a name="l03535"></a>03535 (
<a name="l03536"></a>03536 SELECT lmc_nid, lmc_fval, max_class, feature, probability,
<a name="l03537"></a>03537 parent_id, id, is_cont, split_value
<a name="l03538"></a>03538 FROM %
<a name="l03539"></a>03539 WHERE array_upper(tree_location,1) = % AND tid=%
<a name="l03540"></a>03540 ) AS gt
<a name="l03541"></a>03541 WHERE pt.jump = gt.id;&#39;,
<a name="l03542"></a>03542 ARRAY[
<a name="l03543"></a>03543 table_names[table_pick],
<a name="l03544"></a>03544 table_names[(table_pick) % 2 + 1],
<a name="l03545"></a>03545 encoded_table_name,
<a name="l03546"></a>03546 tree_table_name,
<a name="l03547"></a>03547 MADLIB_SCHEMA.__to_char(curr_level),
<a name="l03548"></a>03548 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l03549"></a>03549 ]
<a name="l03550"></a>03550 )
<a name="l03551"></a>03551 INTO curstmt;
<a name="l03552"></a>03552 EXECUTE curstmt;
<a name="l03553"></a>03553
<a name="l03554"></a>03554 <span class="comment">/*</span>
<a name="l03555"></a>03555 <span class="comment"> * If the node whose id is &quot;jump&quot; does not exist in this tree, </span>
<a name="l03556"></a>03556 <span class="comment"> * move those records into the result table </span>
<a name="l03557"></a>03557 <span class="comment"> * (they are classified with the max_class of the node they came from).</span>
<a name="l03558"></a>03558 <span class="comment"> */</span>
<a name="l03559"></a>03559 SELECT MADLIB_SCHEMA.__format(
<a name="l03560"></a>03560 &#39;INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
<a name="l03561"></a>03561 SELECT &#39;||tree_id||&#39;,id, 0, class, prob, parent_id, leaf_id
<a name="l03562"></a>03562 FROM %
<a name="l03563"></a>03563 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)&#39;,
<a name="l03564"></a>03564 ARRAY[
<a name="l03565"></a>03565 result_table_name,
<a name="l03566"></a>03566 table_names[table_pick],
<a name="l03567"></a>03567 tree_table_name,
<a name="l03568"></a>03568 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l03569"></a>03569 ]
<a name="l03570"></a>03570 )
<a name="l03571"></a>03571 INTO curstmt;
<a name="l03572"></a>03572 EXECUTE curstmt;
<a name="l03573"></a>03573
<a name="l03574"></a>03574 -- then remove those records from the table that is still being classified
<a name="l03575"></a>03575 SELECT MADLIB_SCHEMA.__format(
<a name="l03576"></a>03576 &#39;DELETE FROM %
<a name="l03577"></a>03577 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)&#39;,
<a name="l03578"></a>03578 ARRAY[
<a name="l03579"></a>03579 table_names[table_pick],
<a name="l03580"></a>03580 tree_table_name,
<a name="l03581"></a>03581 MADLIB_SCHEMA.__to_char(tree_id)
<a name="l03582"></a>03582 ]
<a name="l03583"></a>03583 )
<a name="l03584"></a>03584 INTO curstmt;
<a name="l03585"></a>03585 EXECUTE curstmt;
<a name="l03586"></a>03586 END LOOP;
<a name="l03587"></a>03587
<a name="l03588"></a>03588 EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT &#39;||tree_id||&#39;,* FROM &#39;||
<a name="l03589"></a>03589 table_names[table_pick] ||&#39; WHERE jump = 0;&#39;;
<a name="l03590"></a>03590 EXECUTE &#39;INSERT INTO &#39;||result_table_name||&#39; SELECT &#39;||tree_id||&#39;,* FROM &#39;||
<a name="l03591"></a>03591 table_names[table_pick % 2 + 1] ||&#39; WHERE jump = 0;&#39;;
<a name="l03592"></a>03592 END LOOP;
<a name="l03593"></a>03593
<a name="l03594"></a>03594 IF (verbosity &gt; 0) THEN
<a name="l03595"></a>03595 RAISE INFO &#39;final classification time:%&#39;, clock_timestamp() - time_stamp;
<a name="l03596"></a>03596 END IF;
<a name="l03597"></a>03597
<a name="l03598"></a>03598 RETURN ARRAY[encoded_table_name, result_table_name];
<a name="l03599"></a>03599 END
<a name="l03600"></a>03600 $$ LANGUAGE PLPGSQL;
<a name="l03601"></a>03601
<a name="l03602"></a>03602
<a name="l03603"></a>03603 <span class="comment">/*</span>
<a name="l03604"></a>03604 <span class="comment"> * @brief This function checks the accuracy of the trained tree model.</span>
<a name="l03605"></a>03605 <span class="comment"> * </span>
<a name="l03606"></a>03606 <span class="comment"> * @param tree_table_name The name of the tree table containing the model.</span>
<a name="l03607"></a>03607 <span class="comment"> * @param scoring_table_name The full name of the table/view with the </span>
<a name="l03608"></a>03608 <span class="comment"> * data to be scored.</span>
<a name="l03609"></a>03609 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode.</span>
<a name="l03610"></a>03610 <span class="comment"> *</span>
<a name="l03611"></a>03611 <span class="comment"> * @return The fraction of scored records whose class was predicted correctly,</span>
<a name="l03612"></a>03612 <span class="comment"> * or NULL if no records were scored.</span>
<a name="l03613"></a>03613 <span class="comment"> */</span>
<a name="l03614"></a>03614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
<a name="l03615"></a>03615 (
<a name="l03616"></a>03616 tree_table_name TEXT,
<a name="l03617"></a>03617 scoring_table_name TEXT,
<a name="l03618"></a>03618 verbosity INT
<a name="l03619"></a>03619 )
<a name="l03620"></a>03620 RETURNS FLOAT AS $$
<a name="l03621"></a>03621 DECLARE
<a name="l03622"></a>03622 result_table_name TEXT;
<a name="l03623"></a>03623 result_table_name_final TEXT;
<a name="l03624"></a>03624 id_col_name TEXT := &#39;id&#39;;
<a name="l03625"></a>03625 class_col_name TEXT := &#39;class&#39;;
<a name="l03626"></a>03626 curstmt TEXT := &#39;&#39;;
<a name="l03627"></a>03627 num_of_row FLOAT := 0.0;
<a name="l03628"></a>03628 mis_of_row FLOAT := 0.0;
<a name="l03629"></a>03629 encoded_table_name TEXT := &#39;&#39;;
<a name="l03630"></a>03630 table_names TEXT[];
<a name="l03631"></a>03631 BEGIN
<a name="l03632"></a>03632 -- only silence the lower-severity messages when NOT running in verbose mode
<a name="l03633"></a>03633 IF (verbosity &lt; 1) THEN
<a name="l03634"></a>03634 -- get rid of the messages whose severity level is lower than &#39;WARNING&#39;
<a name="l03635"></a>03635 SET client_min_messages = WARNING;
<a name="l03636"></a>03636 END IF;
<a name="l03637"></a>03637
<a name="l03638"></a>03638 PERFORM MADLIB_SCHEMA.__assert
<a name="l03639"></a>03639 (
<a name="l03640"></a>03640 (tree_table_name IS NOT NULL) AND
<a name="l03641"></a>03641 (
<a name="l03642"></a>03642 MADLIB_SCHEMA.__table_exists
<a name="l03643"></a>03643 (
<a name="l03644"></a>03644 tree_table_name
<a name="l03645"></a>03645 )
<a name="l03646"></a>03646 ),
<a name="l03647"></a>03647 &#39;the specified tree table&#39; || coalesce(&#39;&lt;&#39; || tree_table_name
<a name="l03648"></a>03648 || &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
<a name="l03649"></a>03649 );
<a name="l03650"></a>03650
<a name="l03651"></a>03651 PERFORM MADLIB_SCHEMA.__assert
<a name="l03652"></a>03652 (
<a name="l03653"></a>03653 (scoring_table_name IS NOT NULL) AND
<a name="l03654"></a>03654 (
<a name="l03655"></a>03655 MADLIB_SCHEMA.__table_exists
<a name="l03656"></a>03656 (
<a name="l03657"></a>03657 scoring_table_name
<a name="l03658"></a>03658 )
<a name="l03659"></a>03659 ),
<a name="l03660"></a>03660 &#39;the specified scoring table&#39; ||
<a name="l03661"></a>03661 coalesce(&#39;&lt;&#39; || scoring_table_name ||
<a name="l03662"></a>03662 &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
<a name="l03663"></a>03663 );
<a name="l03664"></a>03664
<a name="l03665"></a>03665 PERFORM MADLIB_SCHEMA.__assert
<a name="l03666"></a>03666 (
<a name="l03667"></a>03667 MADLIB_SCHEMA.__column_exists
<a name="l03668"></a>03668 (
<a name="l03669"></a>03669 scoring_table_name,
<a name="l03670"></a>03670 MADLIB_SCHEMA.__get_class_column_name
<a name="l03671"></a>03671 (
<a name="l03672"></a>03672 MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
<a name="l03673"></a>03673 )
<a name="l03674"></a>03674 ),
<a name="l03675"></a>03675 &#39;the specified scoring table&lt;&#39; || scoring_table_name ||
<a name="l03676"></a>03676 &#39;&gt; does not have class column&#39;
<a name="l03677"></a>03677 );
<a name="l03678"></a>03678
<a name="l03679"></a>03679 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
<a name="l03680"></a>03680 (
<a name="l03681"></a>03681 scoring_table_name,
<a name="l03682"></a>03682 tree_table_name,
<a name="l03683"></a>03683 verbosity
<a name="l03684"></a>03684 );
<a name="l03685"></a>03685 encoded_table_name = table_names[1];
<a name="l03686"></a>03686 result_table_name = table_names[2];
<a name="l03687"></a>03687 result_table_name_final = result_table_name||&#39;_final&#39;;
<a name="l03688"></a>03688
<a name="l03689"></a>03689 PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
<a name="l03690"></a>03690 (
<a name="l03691"></a>03691 result_table_name,
<a name="l03692"></a>03692 result_table_name_final
<a name="l03693"></a>03693 );
<a name="l03694"></a>03694 -- num_of_row: number of records in the final result table
<a name="l03695"></a>03695 SELECT MADLIB_SCHEMA.__format
<a name="l03696"></a>03696 (
<a name="l03697"></a>03697 &#39;SELECT count(id) FROM %;&#39;,
<a name="l03698"></a>03698 result_table_name_final
<a name="l03699"></a>03699 )
<a name="l03700"></a>03700 INTO curstmt;
<a name="l03701"></a>03701
<a name="l03702"></a>03702 EXECUTE curstmt INTO num_of_row;
<a name="l03703"></a>03703 -- mis_of_row: records whose class in the encoded table differs from the result class
<a name="l03704"></a>03704 SELECT MADLIB_SCHEMA.__format
<a name="l03705"></a>03705 (
<a name="l03706"></a>03706 &#39;SELECT count(t2.id)
<a name="l03707"></a>03707 FROM % t1, % t2
<a name="l03708"></a>03708 WHERE t1.% = t2.id AND t1.% &lt;&gt; t2.class&#39;,
<a name="l03709"></a>03709 ARRAY[
<a name="l03710"></a>03710 encoded_table_name,
<a name="l03711"></a>03711 result_table_name_final,
<a name="l03712"></a>03712 id_col_name,
<a name="l03713"></a>03713 class_col_name
<a name="l03714"></a>03714 ]
<a name="l03715"></a>03715 )
<a name="l03716"></a>03716 INTO curstmt;
<a name="l03717"></a>03717
<a name="l03718"></a>03718 EXECUTE curstmt INTO mis_of_row;
<a name="l03719"></a>03719 -- the intermediate tables are no longer needed once both counts are known
<a name="l03720"></a>03720 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || encoded_table_name || &#39;;&#39;;
<a name="l03721"></a>03721 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name || &#39;;&#39;;
<a name="l03722"></a>03722 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || result_table_name_final || &#39;;&#39;;
<a name="l03723"></a>03723 RETURN (num_of_row - mis_of_row) / NULLIF(num_of_row, 0.0); -- NULL, not a division by zero, when nothing was scored
<a name="l03724"></a>03724 END;
<a name="l03725"></a>03725 $$ LANGUAGE PLPGSQL;
<a name="l03726"></a>03726
<a name="l03727"></a>03727
<a name="l03728"></a>03728 <span class="comment">/*</span>
<a name="l03729"></a>03729 <span class="comment"> * @brief Clean up a trained model: drop the model table and the related</span>
<a name="l03730"></a>03730 <span class="comment"> * bookkeeping (training info record, metatable and encoded table).</span>
<a name="l03731"></a>03731 <span class="comment"> *</span>
<a name="l03732"></a>03732 <span class="comment"> * @param model_table_name The name of the table containing</span>
<a name="l03733"></a>03733 <span class="comment"> * the model&#39;s information.</span>
<a name="l03734"></a>03734 <span class="comment"> *</span>
<a name="l03735"></a>03735 <span class="comment"> * @return Always true once the cleanup has completed without an error.</span>
<a name="l03736"></a>03736 <span class="comment"> */</span>
<a name="l03737"></a>03737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
<a name="l03738"></a>03738 (
<a name="l03739"></a>03739 model_table_name TEXT
<a name="l03740"></a>03740 )
<a name="l03741"></a>03741 RETURNS BOOLEAN AS $$
<a name="l03742"></a>03742 DECLARE
<a name="l03743"></a>03743 metatable_name TEXT;
<a name="l03744"></a>03744 num_refs INT;
<a name="l03745"></a>03745 BEGIN
<a name="l03746"></a>03746 -- only messages of severity WARNING or higher are sent to the client
<a name="l03747"></a>03747 SET client_min_messages = WARNING;
<a name="l03748"></a>03748
<a name="l03749"></a>03749 -- the model table has to be named and has to exist
<a name="l03750"></a>03750 PERFORM MADLIB_SCHEMA.__assert
<a name="l03751"></a>03751 (
<a name="l03752"></a>03752 (model_table_name IS NOT NULL) AND
<a name="l03753"></a>03753 MADLIB_SCHEMA.__table_exists(model_table_name),
<a name="l03754"></a>03754 &#39;the specified tree table&#39; ||
<a name="l03755"></a>03755 coalesce
<a name="l03756"></a>03756 (
<a name="l03757"></a>03757 &#39;&lt;&#39; || model_table_name || &#39;&gt; does not exists&#39;,
<a name="l03758"></a>03758 &#39; is NULL&#39;
<a name="l03759"></a>03759 )
<a name="l03760"></a>03760 );
<a name="l03761"></a>03761
<a name="l03762"></a>03762 -- the bookkeeping below is only needed when the training_info table exists
<a name="l03763"></a>03763 IF (MADLIB_SCHEMA.__table_exists(&#39;MADLIB_SCHEMA.training_info&#39;)) THEN
<a name="l03764"></a>03764 metatable_name := MADLIB_SCHEMA.__get_metatable_name(model_table_name);
<a name="l03765"></a>03765
<a name="l03766"></a>03766 IF (metatable_name IS NOT NULL) THEN
<a name="l03767"></a>03767 SELECT count(*)
<a name="l03768"></a>03768 INTO num_refs
<a name="l03769"></a>03769 FROM MADLIB_SCHEMA.training_info
<a name="l03770"></a>03770 WHERE training_metatable_oid = metatable_name::regclass;
<a name="l03771"></a>03771
<a name="l03772"></a>03772 -- a single reference means no other training procedure uses
<a name="l03773"></a>03773 -- this metatable, so it and the encoded table can be dropped
<a name="l03774"></a>03774 IF (num_refs = 1) THEN
<a name="l03775"></a>03775 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
<a name="l03776"></a>03776 EXECUTE &#39;DROP TABLE IF EXISTS &#39; ||
<a name="l03777"></a>03777 MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || &#39;;&#39;;
<a name="l03778"></a>03778 END IF;
<a name="l03779"></a>03779 END IF;
<a name="l03780"></a>03780
<a name="l03781"></a>03781 -- remove the training_info record before the model table is dropped
<a name="l03782"></a>03782 PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
<a name="l03783"></a>03783 END IF;
<a name="l03784"></a>03784
<a name="l03785"></a>03785 -- the model table is dropped whether or not training_info exists
<a name="l03786"></a>03786 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || model_table_name;
<a name="l03787"></a>03787
<a name="l03788"></a>03788 RETURN TRUE;
<a name="l03789"></a>03789 END
<a name="l03790"></a>03790 $$ LANGUAGE PLPGSQL;
<a name="l03791"></a>03791
<a name="l03792"></a>03792 <span class="comment">/*</span>
<a name="l03793"></a>03793 <span class="comment"> * @brief Validate the common parameters for C4.5 and RF API.</span>
<a name="l03794"></a>03794 <span class="comment"> *</span>
<a name="l03795"></a>03795 <span class="comment"> * @param split_criterion The name of the split criterion that should be used </span>
<a name="l03796"></a>03796 <span class="comment"> * for tree construction. The valid values are</span>
<a name="l03797"></a>03797 <span class="comment"> * ‘infogain’, ‘gainratio’, and ‘gini’. It can&#39;t be NULL.</span>
<a name="l03798"></a>03798 <span class="comment"> * @param training_table_name The name of the table/view with the source data.</span>
<a name="l03799"></a>03799 <span class="comment"> * @param result_table_name The name of the table where the resulting DT </span>
<a name="l03800"></a>03800 <span class="comment"> * will be kept.</span>
<a name="l03801"></a>03801 <span class="comment"> * @param continuous_feature_names A comma-separated list of the names of features whose values </span>
<a name="l03802"></a>03802 <span class="comment"> * are continuous. The default is null, which means there are </span>
<a name="l03803"></a>03803 <span class="comment"> * no continuous features in the training table.</span>
<a name="l03804"></a>03804 <span class="comment"> * @param feature_col_names A comma-separated list of the names of table columns, each of</span>
<a name="l03805"></a>03805 <span class="comment"> * which defines a feature. The default value is null, which means </span>
<a name="l03806"></a>03806 <span class="comment"> * all the columns in the training table, except columns named </span>
<a name="l03807"></a>03807 <span class="comment"> * ‘id’ and ‘class’, will be used as features.</span>
<a name="l03808"></a>03808 <span class="comment"> * @param id_col_name The name of the column containing an ID for each record.</span>
<a name="l03809"></a>03809 <span class="comment"> * @param class_col_name The name of the column containing the labeled class. </span>
<a name="l03810"></a>03810 <span class="comment"> * @param how2handle_missing_value The way to handle missing values. The valid values </span>
<a name="l03811"></a>03811 <span class="comment"> * are &#39;explicit&#39; and &#39;ignore&#39;.</span>
<a name="l03812"></a>03812 <span class="comment"> * @param max_tree_depth Specifies the maximum number of levels in the result DT </span>
<a name="l03813"></a>03813 <span class="comment"> * to avoid overgrown DTs. </span>
<a name="l03814"></a>03814 <span class="comment"> * @param node_prune_threshold The minimum percentage of the number of records required in a</span>
<a name="l03815"></a>03815 <span class="comment"> * child node. It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
<a name="l03816"></a>03816 <span class="comment"> * This threshold only applies to the non-root nodes. Therefore,</span>
<a name="l03817"></a>03817 <span class="comment"> * if its value is 1, then the trained tree only has one node (the root node);</span>
<a name="l03818"></a>03818 <span class="comment"> * if its value is 0, then no nodes will be pruned by this parameter.</span>
<a name="l03819"></a>03819 <span class="comment"> * @param node_split_threshold The minimum percentage of the number of records required in a</span>
<a name="l03820"></a>03820 <span class="comment"> * node in order for a further split to be possible.</span>
<a name="l03821"></a>03821 <span class="comment"> * It can&#39;t be NULL. The range of it is in [0.0, 1.0].</span>
<a name="l03822"></a>03822 <span class="comment"> * If its value is 1, then the trained tree only has two levels, since</span>
<a name="l03823"></a>03823 <span class="comment"> * only the root node can grow; if its value is 0, then trees can grow</span>
<a name="l03824"></a>03824 <span class="comment"> * extensively.</span>
<a name="l03825"></a>03825 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l03826"></a>03826 <span class="comment"> * @param error_msg The reported error message when result_table_name is invalid.</span>
<a name="l03827"></a>03827 <span class="comment"> *</span>
<a name="l03828"></a>03828 <span class="comment"> */</span>
<a name="l03829"></a>03829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
<a name="l03830"></a>03830 (
<a name="l03831"></a>03831 split_criterion TEXT,
<a name="l03832"></a>03832 training_table_name TEXT,
<a name="l03833"></a>03833 result_table_name TEXT,
<a name="l03834"></a>03834 continuous_feature_names TEXT,
<a name="l03835"></a>03835 feature_col_names TEXT,
<a name="l03836"></a>03836 id_col_name TEXT,
<a name="l03837"></a>03837 class_col_name TEXT,
<a name="l03838"></a>03838 how2handle_missing_value TEXT,
<a name="l03839"></a>03839 max_tree_depth INT,
<a name="l03840"></a>03840 node_prune_threshold FLOAT,
<a name="l03841"></a>03841 node_split_threshold FLOAT,
<a name="l03842"></a>03842 verbosity INT,
<a name="l03843"></a>03843 error_msg TEXT
<a name="l03844"></a>03844 )
<a name="l03845"></a>03845 RETURNS void AS $$
<a name="l03846"></a>03846 DECLARE
<a name="l03847"></a>03847 num_of_element BIGINT;
<a name="l03848"></a>03848 BEGIN
<a name="l03849"></a>03849 PERFORM MADLIB_SCHEMA.__assert
<a name="l03850"></a>03850 (
<a name="l03851"></a>03851 (split_criterion IS NOT NULL) AND
<a name="l03852"></a>03852 (
<a name="l03853"></a>03853 split_criterion = &#39;infogain&#39; OR
<a name="l03854"></a>03854 split_criterion = &#39;gainratio&#39; OR
<a name="l03855"></a>03855 split_criterion = &#39;gini&#39;
<a name="l03856"></a>03856 ),
<a name="l03857"></a>03857 &#39;split_criterion must be infogain, gainratio or gini&#39;
<a name="l03858"></a>03858 );
<a name="l03859"></a>03859
<a name="l03860"></a>03860 PERFORM MADLIB_SCHEMA.__assert
<a name="l03861"></a>03861 (
<a name="l03862"></a>03862 how2handle_missing_value = &#39;ignore&#39; OR
<a name="l03863"></a>03863 how2handle_missing_value = &#39;explicit&#39;,
<a name="l03864"></a>03864 &#39;how2handle_missing_value must be ignore or explicit!&#39;
<a name="l03865"></a>03865 );
<a name="l03866"></a>03866
<a name="l03867"></a>03867 PERFORM MADLIB_SCHEMA.__assert
<a name="l03868"></a>03868 (
<a name="l03869"></a>03869 max_tree_depth IS NOT NULL AND
<a name="l03870"></a>03870 max_tree_depth &gt; 0,
<a name="l03871"></a>03871 &#39;max_tree_depth value must be greater than 0&#39;
<a name="l03872"></a>03872 );
<a name="l03873"></a>03873
<a name="l03874"></a>03874 PERFORM MADLIB_SCHEMA.__assert
<a name="l03875"></a>03875 (
<a name="l03876"></a>03876 node_prune_threshold IS NOT NULL AND
<a name="l03877"></a>03877 float8ge(node_prune_threshold, 0) AND
<a name="l03878"></a>03878 float8le(node_prune_threshold, 1),
<a name="l03879"></a>03879 &#39;node_prune_threshold value must be in range from 0 to 1&#39;
<a name="l03880"></a>03880 );
<a name="l03881"></a>03881
<a name="l03882"></a>03882 PERFORM MADLIB_SCHEMA.__assert
<a name="l03883"></a>03883 (
<a name="l03884"></a>03884 node_split_threshold IS NOT NULL AND
<a name="l03885"></a>03885 float8ge(node_split_threshold, 0) AND
<a name="l03886"></a>03886 float8le(node_split_threshold, 1),
<a name="l03887"></a>03887 &#39;node_split_threshold value must be in range from 0 to 1&#39;
<a name="l03888"></a>03888 );
<a name="l03889"></a>03889
<a name="l03890"></a>03890 PERFORM MADLIB_SCHEMA.__assert
<a name="l03891"></a>03891 (
<a name="l03892"></a>03892 verbosity IS NOT NULL,
<a name="l03893"></a>03893 &#39;verbosity must be non-null&#39;
<a name="l03894"></a>03894 );
<a name="l03895"></a>03895
<a name="l03896"></a>03896 PERFORM MADLIB_SCHEMA.__assert
<a name="l03897"></a>03897 (
<a name="l03898"></a>03898 id_col_name IS NOT NULL AND
<a name="l03899"></a>03899 class_col_name IS NOT NULL AND
<a name="l03900"></a>03900 length(btrim(id_col_name, &#39; &#39;)) &gt; 0 AND
<a name="l03901"></a>03901 length(btrim(class_col_name, &#39; &#39;)) &gt; 0,
<a name="l03902"></a>03902 &#39;invalid id column name or class column name&#39;
<a name="l03903"></a>03903 );
<a name="l03904"></a>03904
<a name="l03905"></a>03905 PERFORM MADLIB_SCHEMA.__assert
<a name="l03906"></a>03906 (
<a name="l03907"></a>03907 training_table_name IS NOT NULL AND
<a name="l03908"></a>03908 MADLIB_SCHEMA.__table_exists
<a name="l03909"></a>03909 (
<a name="l03910"></a>03910 training_table_name
<a name="l03911"></a>03911 ),
<a name="l03912"></a>03912 &#39;the specified training table&#39; ||
<a name="l03913"></a>03913 coalesce(&#39;&lt;&#39; ||
<a name="l03914"></a>03914 training_table_name ||
<a name="l03915"></a>03915 &#39;&gt; does not exist&#39;, &#39; is NULL&#39;)
<a name="l03916"></a>03916 );
<a name="l03917"></a>03917
<a name="l03918"></a>03918 EXECUTE &#39;SELECT count(*) FROM
<a name="l03919"></a>03919 (SELECT * FROM &#39;||training_table_name||&#39; LIMIT 1) l&#39;
<a name="l03920"></a>03920 INTO num_of_element;
<a name="l03921"></a>03921
<a name="l03922"></a>03922 PERFORM MADLIB_SCHEMA.__assert
<a name="l03923"></a>03923 (
<a name="l03924"></a>03924 num_of_element &gt; 0,
<a name="l03925"></a>03925 &#39;the specified training table &lt;&#39;||training_table_name||
<a name="l03926"></a>03926 &#39;&gt; should not be empty&#39;
<a name="l03927"></a>03927 );
<a name="l03928"></a>03928
<a name="l03929"></a>03929
<a name="l03930"></a>03930 PERFORM MADLIB_SCHEMA.__assert
<a name="l03931"></a>03931 (
<a name="l03932"></a>03932 result_table_name IS NOT NULL,
<a name="l03933"></a>03933 &#39;the specified result &#39; || error_msg || &#39; table name is NULL&#39;
<a name="l03934"></a>03934 );
<a name="l03935"></a>03935
<a name="l03936"></a>03936 PERFORM MADLIB_SCHEMA.__assert
<a name="l03937"></a>03937 (
<a name="l03938"></a>03938 NOT MADLIB_SCHEMA.__table_exists
<a name="l03939"></a>03939 (
<a name="l03940"></a>03940 result_table_name
<a name="l03941"></a>03941 )
<a name="l03942"></a>03942 ,
<a name="l03943"></a>03943 &#39;the specified result &#39; || error_msg || &#39; table&lt;&#39; ||
<a name="l03944"></a>03944 result_table_name ||
<a name="l03945"></a>03945 &#39;&gt; exists&#39;
<a name="l03946"></a>03946 );
<a name="l03947"></a>03947 END
<a name="l03948"></a>03948 $$ LANGUAGE PLPGSQL STABLE;
<a name="l03949"></a>03949
<a name="l03950"></a>03950
<a name="l03951"></a>03951 <span class="comment">/*</span>
<a name="l03952"></a>03952 <span class="comment"> * @brief Get the name of the encoded table and the name of</span>
<a name="l03953"></a>03953 <span class="comment"> * its meta table.</span>
<a name="l03954"></a>03954 <span class="comment"> * @param result_table_name The name of the table where the </span>
<a name="l03955"></a>03955 <span class="comment"> * resulting DT will be kept </span>
<a name="l03956"></a>03956 <span class="comment"> * @param error_msg The reported error message when the</span>
<a name="l03957"></a>03957 <span class="comment"> * length of result schema name plus</span>
<a name="l03958"></a>03958 <span class="comment"> * the length of result table name is</span>
<a name="l03959"></a>03959 <span class="comment"> * larger than 58.</span>
<a name="l03960"></a>03960 <span class="comment"> * </span>
<a name="l03961"></a>03961 <span class="comment"> * @return A text array that contains two elements. The first element</span>
<a name="l03962"></a>03962 <span class="comment"> * is the encoded table name and the second is the meta table name.</span>
<a name="l03963"></a>03963 <span class="comment"> * </span>
<a name="l03964"></a>03964 <span class="comment"> */</span>
<a name="l03965"></a>03965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
<a name="l03966"></a>03966 (
<a name="l03967"></a>03967 result_table_name TEXT,
<a name="l03968"></a>03968 error_msg TEXT
<a name="l03969"></a>03969 )
<a name="l03970"></a>03970 RETURNS TEXT[] AS $$
<a name="l03971"></a>03971 DECLARE
<a name="l03972"></a>03972 result_schema_name TEXT;
<a name="l03973"></a>03973 table_names TEXT[];
<a name="l03974"></a>03974 BEGIN
<a name="l03975"></a>03975 result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
<a name="l03976"></a>03976
<a name="l03977"></a>03977 -- the maximum length of an identifier is 63
<a name="l03978"></a>03978 -- encoding table name convention: &lt;schema name&gt;_&lt;table name&gt;_ed
<a name="l03979"></a>03979 -- data info table name convention: &lt;schema name&gt;_&lt;table name&gt;_di
<a name="l03980"></a>03980 -- the KV table name convention: &lt;schema name&gt;_&lt;table name&gt;_&lt;####&gt;
<a name="l03981"></a>03981 -- therefore, the maximum length of &#39;&lt;schema name&gt;_&lt;table name&gt;&#39; is 58
<a name="l03982"></a>03982 PERFORM MADLIB_SCHEMA.__assert
<a name="l03983"></a>03983 (
<a name="l03984"></a>03984 length(
<a name="l03985"></a>03985 result_schema_name ||
<a name="l03986"></a>03986 &#39;_&#39; ||
<a name="l03987"></a>03987 result_table_name) &lt;= 58,
<a name="l03988"></a>03988 &#39;the maximum length of &#39;&#39;&#39; || error_msg || &#39;&#39;&#39; is 58&#39;
<a name="l03989"></a>03989 );
<a name="l03990"></a>03990
<a name="l03991"></a>03991 -- the encoded table and meta table will be under the specified schema
<a name="l03992"></a>03992 table_names[1] = result_schema_name ||
<a name="l03993"></a>03993 &#39;.&#39; ||
<a name="l03994"></a>03994 replace(result_table_name, &#39;.&#39;, &#39;_&#39;) ||
<a name="l03995"></a>03995 &#39;_ed&#39;;
<a name="l03996"></a>03996 table_names[2] = result_schema_name ||
<a name="l03997"></a>03997 &#39;.&#39; ||
<a name="l03998"></a>03998 replace(result_table_name, &#39;.&#39;, &#39;_&#39;) ||
<a name="l03999"></a>03999 &#39;_di&#39;;
<a name="l04000"></a>04000 RETURN table_names;
<a name="l04001"></a>04001 END
<a name="l04002"></a>04002 $$ LANGUAGE PLPGSQL STABLE;
<a name="l04003"></a>04003
<a name="l04004"></a>04004
<a name="l04005"></a>04005 <span class="comment">/*</span>
<a name="l04006"></a>04006 <span class="comment"> * @brief Validate if the provided columns are in the training table or not.</span>
<a name="l04007"></a>04007 <span class="comment"> *</span>
<a name="l04008"></a>04008 <span class="comment"> * @param training_table_name The name of the table/view with the source data.</span>
<a name="l04009"></a>04009 <span class="comment"> * @param continuous_feature_names A text array that contains all the continuous </span>
<a name="l04010"></a>04010 <span class="comment"> * features&#39; names. </span>
<a name="l04011"></a>04011 <span class="comment"> * @param feature_col_names A text array that contains all the features&#39; names.</span>
<a name="l04012"></a>04012 <span class="comment"> * @param id_col_name The name of the column containing an ID for each record.</span>
<a name="l04013"></a>04013 <span class="comment"> * @param class_col_name The name of the column containing the labeled class. </span>
<a name="l04014"></a>04014 <span class="comment"> * @param features_per_node The number of features to be considered when finding </span>
<a name="l04015"></a>04015 <span class="comment"> * a best split.</span>
<a name="l04016"></a>04016 <span class="comment"> */</span>
<a name="l04017"></a>04017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
<a name="l04018"></a>04018 (
<a name="l04019"></a>04019 training_table_name TEXT,
<a name="l04020"></a>04020 continuous_feature_names TEXT[],
<a name="l04021"></a>04021 feature_col_names TEXT[],
<a name="l04022"></a>04022 id_col_name TEXT,
<a name="l04023"></a>04023 class_col_name TEXT,
<a name="l04024"></a>04024 features_per_node INT
<a name="l04025"></a>04025 )
<a name="l04026"></a>04026 RETURNS VOID AS $$
<a name="l04027"></a>04027 DECLARE
<a name="l04028"></a>04028 num_attrs INT;
<a name="l04029"></a>04029 BEGIN
<a name="l04030"></a>04030 PERFORM MADLIB_SCHEMA.__assert
<a name="l04031"></a>04031 (
<a name="l04032"></a>04032 MADLIB_SCHEMA.__column_exists
<a name="l04033"></a>04033 (
<a name="l04034"></a>04034 training_table_name,
<a name="l04035"></a>04035 lower(btrim(id_col_name, &#39; &#39;))
<a name="l04036"></a>04036 ),
<a name="l04037"></a>04037 &#39;the specified training table&lt;&#39; ||
<a name="l04038"></a>04038 training_table_name ||
<a name="l04039"></a>04039 &#39;&gt; does not have column &#39;&#39;&#39; ||
<a name="l04040"></a>04040 id_col_name ||
<a name="l04041"></a>04041 &#39;&#39;&#39;&#39;
<a name="l04042"></a>04042 );
<a name="l04043"></a>04043
<a name="l04044"></a>04044 PERFORM MADLIB_SCHEMA.__assert
<a name="l04045"></a>04045 (
<a name="l04046"></a>04046 MADLIB_SCHEMA.__column_exists
<a name="l04047"></a>04047 (
<a name="l04048"></a>04048 training_table_name,
<a name="l04049"></a>04049 lower(btrim(class_col_name, &#39; &#39;))
<a name="l04050"></a>04050 ),
<a name="l04051"></a>04051 &#39;the specified training table&lt;&#39; ||
<a name="l04052"></a>04052 training_table_name ||
<a name="l04053"></a>04053 &#39;&gt; does not have column &#39;&#39;&#39; ||
<a name="l04054"></a>04054 class_col_name ||
<a name="l04055"></a>04055 &#39;&#39;&#39;&#39;
<a name="l04056"></a>04056 );
<a name="l04057"></a>04057
<a name="l04058"></a>04058 IF (feature_col_names IS NULL) THEN
<a name="l04059"></a>04059 -- 2 means the id and class columns
<a name="l04060"></a>04060 num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
<a name="l04061"></a>04061
<a name="l04062"></a>04062 PERFORM MADLIB_SCHEMA.__assert
<a name="l04063"></a>04063 (
<a name="l04064"></a>04064 (features_per_node IS NULL AND num_attrs &gt; 0) OR
<a name="l04065"></a>04065 (features_per_node IS NOT NULL AND num_attrs &gt;= features_per_node),
<a name="l04066"></a>04066 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
<a name="l04067"></a>04067 &#39;of features of the training table&#39;
<a name="l04068"></a>04068 );
<a name="l04069"></a>04069 PERFORM MADLIB_SCHEMA.__assert
<a name="l04070"></a>04070 (
<a name="l04071"></a>04071 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
<a name="l04072"></a>04072 &#39;each feature in continuous_feature_names must be a column of the training table&#39;
<a name="l04073"></a>04073 );
<a name="l04074"></a>04074 ELSE
<a name="l04075"></a>04075 num_attrs = array_upper(feature_col_names, 1);
<a name="l04076"></a>04076 PERFORM MADLIB_SCHEMA.__assert
<a name="l04077"></a>04077 (
<a name="l04078"></a>04078 (features_per_node IS NULL AND num_attrs &gt; 0) OR
<a name="l04079"></a>04079 (features_per_node IS NOT NULL AND num_attrs &gt;= features_per_node),
<a name="l04080"></a>04080 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
<a name="l04081"></a>04081 &#39;of features of the training table&#39;
<a name="l04082"></a>04082 );
<a name="l04083"></a>04083 PERFORM MADLIB_SCHEMA.__assert
<a name="l04084"></a>04084 (
<a name="l04085"></a>04085 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
<a name="l04086"></a>04086 &#39;each feature in feature_col_names must be a column of the training table&#39;
<a name="l04087"></a>04087 );
<a name="l04088"></a>04088
<a name="l04089"></a>04089 PERFORM MADLIB_SCHEMA.__assert
<a name="l04090"></a>04090 (
<a name="l04091"></a>04091 coalesce(continuous_feature_names, &#39;{}&#39;::TEXT[]) &lt;@ feature_col_names,
<a name="l04092"></a>04092 &#39;each feature in continuous_feature_names must be in the feature_col_names&#39;
<a name="l04093"></a>04093 );
<a name="l04094"></a>04094 END IF;
<a name="l04095"></a>04095 END
<a name="l04096"></a>04096 $$ LANGUAGE PLPGSQL STABLE;
<a name="l04097"></a>04097
<a name="l04098"></a>04098
<a name="l04099"></a>04099 <span class="comment">/* @brief If the training table is a valid encoded table, then we use it directly.</span>
<a name="l04100"></a>04100 <span class="comment"> * If the training table is not encoded, then we invoke the encoding procedure</span>
<a name="l04101"></a>04101 <span class="comment"> * to transform the training table. </span>
<a name="l04102"></a>04102 <span class="comment"> * With the encoded table, we call the tree grow engine to generate the final tree.</span>
<a name="l04103"></a>04103 <span class="comment"> *</span>
<a name="l04104"></a>04104 <span class="comment"> * @param dt_algo_name The name of the algorithm. Currently, it&#39;s</span>
<a name="l04105"></a>04105 <span class="comment"> * &#39;C4.5&#39; or &#39;RF&#39;</span>
<a name="l04106"></a>04106 <span class="comment"> * @param split_criterion This parameter specifies which split criterion </span>
<a name="l04107"></a>04107 <span class="comment"> * should be used for tree construction and </span>
<a name="l04108"></a>04108 <span class="comment"> * pruning. The valid values are infogain, </span>
<a name="l04109"></a>04109 <span class="comment"> * gainratio, and gini.</span>
<a name="l04110"></a>04110 <span class="comment"> * @param num_trees Total number of trees to be trained. </span>
<a name="l04111"></a>04111 <span class="comment"> * @param features_per_node Total number of features used to compute split </span>
<a name="l04112"></a>04112 <span class="comment"> * gain for each node. </span>
<a name="l04113"></a>04113 <span class="comment"> * @param training_table_name The name of the table/view with the source data. </span>
<a name="l04114"></a>04114 <span class="comment"> * @param validation_table_name The name of the validation table. </span>
<a name="l04115"></a>04115 <span class="comment"> * @param tree_table_name The name of the table where the resulting </span>
<a name="l04116"></a>04116 <span class="comment"> * DT/RF will be stored. </span>
<a name="l04117"></a>04117 <span class="comment"> * @param continuous_feature_names A comma-separated list of the names of features whose values </span>
<a name="l04118"></a>04118 <span class="comment"> * are continuous. The default is null, which means there are </span>
<a name="l04119"></a>04119 <span class="comment"> * no continuous features in the training table.</span>
<a name="l04120"></a>04120 <span class="comment"> * @param feature_col_names A comma-separated list of the names of table columns, each of</span>
<a name="l04121"></a>04121 <span class="comment"> * which defines a feature. The default value is null, which means </span>
<a name="l04122"></a>04122 <span class="comment"> * all the columns in the training table, except columns named </span>
<a name="l04123"></a>04123 <span class="comment"> * ‘id’ and ‘class’, will be used as features.</span>
<a name="l04124"></a>04124 <span class="comment"> * @param id_col_name The name of the column containing id of each point. </span>
<a name="l04125"></a>04125 <span class="comment"> * @param class_col_name The name of the column containing correct class </span>
<a name="l04126"></a>04126 <span class="comment"> * of each point. </span>
<a name="l04127"></a>04127 <span class="comment"> * @param confidence_level A statistical confidence interval of the </span>
<a name="l04128"></a>04128 <span class="comment"> * resubstitution error. </span>
<a name="l04129"></a>04129 <span class="comment"> * @param how2handle_missing_value The way to handle missing values. The valid values </span>
<a name="l04130"></a>04130 <span class="comment"> * are &#39;explicit&#39; and &#39;ignore&#39;.</span>
<a name="l04131"></a>04131 <span class="comment"> * @param max_tree_depth Maximum decision tree depth. </span>
<a name="l04132"></a>04132 <span class="comment"> * @param sampling_percentage The percentage of records sampled to train a tree.</span>
<a name="l04133"></a>04133 <span class="comment"> * If it&#39;s NULL, 0.632 bootstrap will be used</span>
<a name="l04134"></a>04134 <span class="comment"> * @param sampling_needed Whether to enable the sampling functionality. </span>
<a name="l04135"></a>04135 <span class="comment"> * @param node_prune_threshold The minimum percentage of the number of records </span>
<a name="l04136"></a>04136 <span class="comment"> * required in a child node. </span>
<a name="l04137"></a>04137 <span class="comment"> * @param node_split_threshold The minimum percentage of the number of records </span>
<a name="l04138"></a>04138 <span class="comment"> * required in a node in order for a further split </span>
<a name="l04139"></a>04139 <span class="comment"> * to be possible. </span>
<a name="l04140"></a>04140 <span class="comment"> * @param error_msg The reported error message when the result table</span>
<a name="l04141"></a>04141 <span class="comment"> * name is invalid.</span>
<a name="l04142"></a>04142 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l04143"></a>04143 <span class="comment"> *</span>
<a name="l04144"></a>04144 <span class="comment"> * @return An instance of __train_result.</span>
<a name="l04145"></a>04145 <span class="comment"> *</span>
<a name="l04146"></a>04146 <span class="comment"> */</span>
<a name="l04147"></a>04147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
<a name="l04148"></a>04148 (
<a name="l04149"></a>04149 dt_algo_name TEXT,
<a name="l04150"></a>04150 split_criterion TEXT,
<a name="l04151"></a>04151 num_trees INT,
<a name="l04152"></a>04152 features_per_node INT,
<a name="l04153"></a>04153 training_table_name TEXT,
<a name="l04154"></a>04154 validation_table_name TEXT,
<a name="l04155"></a>04155 tree_table_name TEXT,
<a name="l04156"></a>04156 continuous_feature_names TEXT,
<a name="l04157"></a>04157 feature_col_names TEXT,
<a name="l04158"></a>04158 id_col_name TEXT,
<a name="l04159"></a>04159 class_col_name TEXT,
<a name="l04160"></a>04160 confidence_level FLOAT8,
<a name="l04161"></a>04161 how2handle_missing_value TEXT,
<a name="l04162"></a>04162 max_tree_depth INT,
<a name="l04163"></a>04163 sampling_percentage FLOAT8,
<a name="l04164"></a>04164 sampling_needed BOOL,
<a name="l04165"></a>04165 node_prune_threshold FLOAT8,
<a name="l04166"></a>04166 node_split_threshold FLOAT8,
<a name="l04167"></a>04167 error_msg TEXT,
<a name="l04168"></a>04168 verbosity INT
<a name="l04169"></a>04169 )
<a name="l04170"></a>04170 RETURNS RECORD AS $$
<a name="l04171"></a>04171 DECLARE
<a name="l04172"></a>04172 table_names TEXT[]; -- 1: encoded table; 2: meta table
<a name="l04173"></a>04173 h2hmv_routine_id INT := 1;
<a name="l04174"></a>04174 h2hmv_routine_name TEXT;
<a name="l04175"></a>04175 n_fids INT;
<a name="l04176"></a>04176 curstmt TEXT;
<a name="l04177"></a>04177 enc_tree_name TEXT;
<a name="l04178"></a>04178 cont_feature_col_names TEXT[];
<a name="l04179"></a>04179 feature_name_array TEXT[];
<a name="l04180"></a>04180 train_rs MADLIB_SCHEMA.__train_result;
<a name="l04181"></a>04181 BEGIN
<a name="l04182"></a>04182 cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
<a name="l04183"></a>04183 feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
<a name="l04184"></a>04184
<a name="l04185"></a>04185 -- if the training table is a valid encoded table, then we retrieve
<a name="l04186"></a>04186 -- the relevant information from training_info table directly.
<a name="l04187"></a>04187 IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
<a name="l04188"></a>04188 enc_tree_name = MADLIB_SCHEMA.__get_tree_table_name
<a name="l04189"></a>04189 (training_table_name);
<a name="l04190"></a>04190 table_names[1] = training_table_name;
<a name="l04191"></a>04191 table_names[2] = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
<a name="l04192"></a>04192 h2hmv_routine_name = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
<a name="l04193"></a>04193 IF (h2hmv_routine_name = &#39;ignore&#39;) THEN
<a name="l04194"></a>04194 h2hmv_routine_id = 1;
<a name="l04195"></a>04195 ELSE
<a name="l04196"></a>04196 h2hmv_routine_id = 2;
<a name="l04197"></a>04197 END IF;
<a name="l04198"></a>04198
<a name="l04199"></a>04199 -- validate the metatable
<a name="l04200"></a>04200 PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
<a name="l04201"></a>04201
<a name="l04202"></a>04202 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
<a name="l04203"></a>04203 PERFORM MADLIB_SCHEMA.__assert
<a name="l04204"></a>04204 (
<a name="l04205"></a>04205 features_per_node IS NULL OR
<a name="l04206"></a>04206 n_fids &gt;= features_per_node,
<a name="l04207"></a>04207 &#39;the value of features_per_node must be less than or equal to the total number &#39; ||
<a name="l04208"></a>04208 &#39;of features of the training table&#39;
<a name="l04209"></a>04209 );
<a name="l04210"></a>04210 -- create tree table and auxiliary tables
<a name="l04211"></a>04211 -- so that we can get the schema name of the table
<a name="l04212"></a>04212 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
<a name="l04213"></a>04213 ELSE
<a name="l04214"></a>04214 -- the provided columns must be in the training table
<a name="l04215"></a>04215 PERFORM MADLIB_SCHEMA.__check_training_table
<a name="l04216"></a>04216 (
<a name="l04217"></a>04217 training_table_name,
<a name="l04218"></a>04218 cont_feature_col_names,
<a name="l04219"></a>04219 feature_name_array,
<a name="l04220"></a>04220 id_col_name,
<a name="l04221"></a>04221 class_col_name,
<a name="l04222"></a>04222 features_per_node
<a name="l04223"></a>04223 );
<a name="l04224"></a>04224
<a name="l04225"></a>04225 h2hmv_routine_name = btrim(how2handle_missing_value, &#39; &#39;);
<a name="l04226"></a>04226 IF (h2hmv_routine_name = &#39;ignore&#39;) THEN
<a name="l04227"></a>04227 h2hmv_routine_id = 1;
<a name="l04228"></a>04228 ELSE
<a name="l04229"></a>04229 h2hmv_routine_id = 2;
<a name="l04230"></a>04230 END IF;
<a name="l04231"></a>04231
<a name="l04232"></a>04232 -- create tree table and auxiliary tables
<a name="l04233"></a>04233 -- so that we can get the schema name of the table
<a name="l04234"></a>04234 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
<a name="l04235"></a>04235
<a name="l04236"></a>04236 -- encode the training table
<a name="l04237"></a>04237 table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
<a name="l04238"></a>04238 PERFORM MADLIB_SCHEMA.__encode_table
<a name="l04239"></a>04239 (
<a name="l04240"></a>04240 training_table_name,
<a name="l04241"></a>04241 lower(id_col_name),
<a name="l04242"></a>04242 feature_name_array,
<a name="l04243"></a>04243 lower(class_col_name),
<a name="l04244"></a>04244 cont_feature_col_names,
<a name="l04245"></a>04245 table_names[1],
<a name="l04246"></a>04246 table_names[2],
<a name="l04247"></a>04247 h2hmv_routine_id,
<a name="l04248"></a>04248 verbosity
<a name="l04249"></a>04249 );
<a name="l04250"></a>04250 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
<a name="l04251"></a>04251 END IF;
<a name="l04252"></a>04252
<a name="l04253"></a>04253 IF (sampling_needed) THEN
<a name="l04254"></a>04254 IF (features_per_node IS NULL) THEN
<a name="l04255"></a>04255 n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
<a name="l04256"></a>04256 ELSE
<a name="l04257"></a>04257 n_fids = features_per_node;
<a name="l04258"></a>04258 END IF;
<a name="l04259"></a>04259 END IF;
<a name="l04260"></a>04260
<a name="l04261"></a>04261 IF (verbosity &gt; 0) THEN
<a name="l04262"></a>04262 RAISE INFO &#39;features_per_node: %&#39;, n_fids;
<a name="l04263"></a>04263 END IF;
<a name="l04264"></a>04264
<a name="l04265"></a>04265 -- insert data to the training_info table
<a name="l04266"></a>04266 PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
<a name="l04267"></a>04267 (
<a name="l04268"></a>04268 dt_algo_name,
<a name="l04269"></a>04269 tree_table_name,
<a name="l04270"></a>04270 training_table_name,
<a name="l04271"></a>04271 table_names[2],
<a name="l04272"></a>04272 table_names[1],
<a name="l04273"></a>04273 validation_table_name,
<a name="l04274"></a>04274 h2hmv_routine_name,
<a name="l04275"></a>04275 split_criterion,
<a name="l04276"></a>04276 sampling_percentage,
<a name="l04277"></a>04277 n_fids,
<a name="l04278"></a>04278 num_trees
<a name="l04279"></a>04279 );
<a name="l04280"></a>04280
<a name="l04281"></a>04281 -- call the tree grow engine
<a name="l04282"></a>04282 train_rs = MADLIB_SCHEMA.__train_tree
<a name="l04283"></a>04283 (
<a name="l04284"></a>04284 split_criterion,
<a name="l04285"></a>04285 num_trees,
<a name="l04286"></a>04286 n_fids ,
<a name="l04287"></a>04287 table_names[1],
<a name="l04288"></a>04288 table_names[2],
<a name="l04289"></a>04289 tree_table_name,
<a name="l04290"></a>04290 validation_table_name,
<a name="l04291"></a>04291 &#39;id&#39;,
<a name="l04292"></a>04292 &#39;class&#39;,
<a name="l04293"></a>04293 confidence_level,
<a name="l04294"></a>04294 max_tree_depth,
<a name="l04295"></a>04295 sampling_percentage,
<a name="l04296"></a>04296 node_prune_threshold,
<a name="l04297"></a>04297 node_split_threshold,
<a name="l04298"></a>04298 sampling_needed,
<a name="l04299"></a>04299 h2hmv_routine_id,
<a name="l04300"></a>04300 verbosity
<a name="l04301"></a>04301 );
<a name="l04302"></a>04302
<a name="l04303"></a>04303 RETURN train_rs;
<a name="l04304"></a>04304 END
<a name="l04305"></a>04305 $$ LANGUAGE PLPGSQL STABLE;
</pre></div></div>
</div>
<div id="nav-path" class="navpath">
<ul>
<li class="navelem"><a class="el" href="dt_8sql__in.html">dt.sql_in</a> </li>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<li class="footer">Generated on Fri May 10 2013 01:37:13 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
</ul>
</div>
</body>
</html>