blob: 5c5ee986e1bae59885885baec0ddbea07e9402f3 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<title>MADlib: dt_preproc.sql_in Source File</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script src="../mathjax/MathJax.js">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script>
</head>
<body>
<div id="top"><!-- do not remove this div! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">0.6</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./dt__preproc_8sql__in_source.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- Generated by Doxygen 1.7.5.1 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
<script type="text/javascript" src="dynsections.js"></script>
<div id="navrow1" class="tabs">
<ul class="tablist">
<li><a href="index.html"><span>Main&#160;Page</span></a></li>
<li><a href="modules.html"><span>Modules</span></a></li>
<li class="current"><a href="files.html"><span>Files</span></a></li>
<li>
<div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</li>
</ul>
</div>
<div id="navrow2" class="tabs2">
<ul class="tablist">
<li><a href="files.html"><span>File&#160;List</span></a></li>
<li><a href="globals.html"><span>File&#160;Members</span></a></li>
</ul>
</div>
</div>
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
initNavTree('dt__preproc_8sql__in.html','');
</script>
<div id="doc-content">
<div class="header">
<div class="headertitle">
<div class="title">dt_preproc.sql_in</div> </div>
</div>
<div class="contents">
<a href="dt__preproc_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/** </span>
<a name="l00002"></a>00002 <span class="comment"> *</span>
<a name="l00003"></a>00003 <span class="comment"> * @file dt_preproc.sql_in</span>
<a name="l00004"></a>00004 <span class="comment"> *</span>
<a name="l00005"></a>00005 <span class="comment"> * @brief Functions used in C4.5 and random forest for data preprocessing.</span>
<a name="l00006"></a>00006 <span class="comment"> *</span>
<a name="l00007"></a>00007 <span class="comment"> * @create April 5, 2012</span>
<a name="l00008"></a>00008 <span class="comment"> * @modified July 19, 2012</span>
<a name="l00009"></a>00009 <span class="comment"> *</span>
<a name="l00010"></a>00010 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
<a name="l00011"></a>00011
<a name="l00012"></a>00012
<a name="l00013"></a>00013 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
<a name="l00014"></a>00014 <span class="stringliteral"></span>
<a name="l00015"></a>00015 <span class="stringliteral">/* Own macro definitions */</span>
<a name="l00016"></a>00016 <span class="stringliteral">m4_ifelse(</span>
<a name="l00017"></a>00017 <span class="stringliteral"> m4_eval(</span>
<a name="l00018"></a>00018 <span class="stringliteral"> m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
<a name="l00019"></a>00019 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ &lt; 401
<a name="l00020"></a>00020 ), 1,
<a name="l00021"></a>00021 `m4_define(`__GREENPLUM_PRE_4_1__<span class="charliteral">&#39;)&#39;</span>
<a name="l00022"></a>00022 )
<a name="l00023"></a>00023 m4_ifelse(
<a name="l00024"></a>00024 m4_eval(
<a name="l00025"></a>00025 m4_ifdef(`__POSTGRESQL__<span class="stringliteral">&#39;, 1, 0) &amp;&amp;</span>
<a name="l00026"></a>00026 <span class="stringliteral"> __DBMS_VERSION_MAJOR__ &lt; 9</span>
<a name="l00027"></a>00027 <span class="stringliteral"> ), 1,</span>
<a name="l00028"></a>00028 <span class="stringliteral"> `m4_define(`__POSTGRESQL_PRE_9_0__&#39;</span>)<span class="stringliteral">&#39;</span>
<a name="l00029"></a>00029 <span class="stringliteral">)</span>
<a name="l00030"></a>00030 <span class="stringliteral">m4_ifelse(</span>
<a name="l00031"></a>00031 <span class="stringliteral"> m4_eval(</span>
<a name="l00032"></a>00032 <span class="stringliteral"> m4_ifdef(`__GREENPLUM__&#39;</span>, 1, 0) &amp;&amp;
<a name="l00033"></a>00033 __DBMS_VERSION_MAJOR__ * 10000 +
<a name="l00034"></a>00034 __DBMS_VERSION_MINOR__ * 100 +
<a name="l00035"></a>00035 __DBMS_VERSION_PATCH__ &gt;= 40201
<a name="l00036"></a>00036 ), 1,
<a name="l00037"></a>00037 `m4_define(`__GREENPLUM_GE_4_2_1__<span class="charliteral">&#39;)&#39;</span>
<a name="l00038"></a>00038 )
<a name="l00039"></a>00039
<a name="l00040"></a>00040 <span class="comment">/*</span>
<a name="l00041"></a>00041 <span class="comment"> * The file contains the functions to encode a training/classification table for </span>
<a name="l00042"></a>00042 <span class="comment"> * C4.5 and random forest (RF). Given a training table, we encode it into 4 tables: </span>
<a name="l00043"></a>00043 <span class="comment"> * + A table that contains the distinct values and their assigned IDs for all</span>
<a name="l00044"></a>00044 <span class="comment"> * features. We call it the Key-Value(KV) table for features.</span>
<a name="l00045"></a>00045 <span class="comment"> * + A table that contains the distinct labels and their assigned IDs for the </span>
<a name="l00046"></a>00046 <span class="comment"> * class column. We call it the KV table for class.</span>
<a name="l00047"></a>00047 <span class="comment"> * + A table that contains metadata descriptions about the columns of the training</span>
<a name="l00048"></a>00048 <span class="comment"> * table. We call it the metatable.</span>
<a name="l00049"></a>00049 <span class="comment"> * + A table that contains an encoded version of the training table using the </span>
<a name="l00050"></a>00050 <span class="comment"> * KV tables. We call it the encoded table.</span>
<a name="l00051"></a>00051 <span class="comment"> *</span>
<a name="l00052"></a>00052 <span class="comment"> * For a classification table, we only need the first three tables. We will use</span>
<a name="l00053"></a>00053 <span class="comment"> * Golf dataset as an example to illustrate the generated tables:</span>
<a name="l00054"></a>00054 <span class="comment"> *</span>
<a name="l00055"></a>00055 <span class="comment"> * testdb=# select * from golf order by id;</span>
<a name="l00056"></a>00056 <span class="comment"> * id | outlook | temperature | humidity | windy | class </span>
<a name="l00057"></a>00057 <span class="comment"> * ----+----------+-------------+----------+--------+--------------</span>
<a name="l00058"></a>00058 <span class="comment"> * 1 | sunny | 85 | 85 | false | Do not Play</span>
<a name="l00059"></a>00059 <span class="comment"> * 2 | sunny | 80 | 90 | true | Do not Play</span>
<a name="l00060"></a>00060 <span class="comment"> * 3 | overcast | 83 | 78 | false | Play</span>
<a name="l00061"></a>00061 <span class="comment"> * 4 | rain | 70 | 96 | false | Play</span>
<a name="l00062"></a>00062 <span class="comment"> * 5 | rain | 68 | 80 | false | Play</span>
<a name="l00063"></a>00063 <span class="comment"> * 6 | rain | 65 | 70 | true | Do not Play</span>
<a name="l00064"></a>00064 <span class="comment"> * 7 | overcast | 64 | 65 | true | Play</span>
<a name="l00065"></a>00065 <span class="comment"> * 8 | sunny | 72 | 95 | false | Do not Play</span>
<a name="l00066"></a>00066 <span class="comment"> * 9 | sunny | 69 | 70 | false | Play</span>
<a name="l00067"></a>00067 <span class="comment"> * 10 | rain | 75 | 80 | false | Play</span>
<a name="l00068"></a>00068 <span class="comment"> * 11 | sunny | 75 | 70 | true | Play</span>
<a name="l00069"></a>00069 <span class="comment"> * 12 | overcast | 72 | 90 | true | Play</span>
<a name="l00070"></a>00070 <span class="comment"> * 13 | overcast | 81 | 75 | false | Play</span>
<a name="l00071"></a>00071 <span class="comment"> * 14 | rain | 71 | 80 | true | Do not Play</span>
<a name="l00072"></a>00072 <span class="comment"> * (14 rows)</span>
<a name="l00073"></a>00073 <span class="comment"> *</span>
<a name="l00074"></a>00074 <span class="comment"> *</span>
<a name="l00075"></a>00075 <span class="comment"> * The metatable contains the information of the columns in the training table.</span>
<a name="l00076"></a>00076 <span class="comment"> * For each column, it has a record whose structure is defined as:</span>
<a name="l00077"></a>00077 <span class="comment"> *</span>
<a name="l00078"></a>00078 <span class="comment"> * +id The ID assigned to a feature/class/id column. For the class </span>
<a name="l00079"></a>00079 <span class="comment"> * colum,it&#39;s 0. To be determistic, the IDs for feature columns</span>
<a name="l00080"></a>00080 <span class="comment"> * starts at 1 and are assigned according to the alphabet order </span>
<a name="l00081"></a>00081 <span class="comment"> * of the column names. The ID for the id column is the largest</span>
<a name="l00082"></a>00082 <span class="comment"> * feature ID plus one. </span>
<a name="l00083"></a>00083 <span class="comment"> * +column_name The name of the class/feature/id column.</span>
<a name="l00084"></a>00084 <span class="comment"> * +column_type &#39;c&#39; means the column is a class. </span>
<a name="l00085"></a>00085 <span class="comment"> * &#39;f&#39; means it&#39;s a feature column.</span>
<a name="l00086"></a>00086 <span class="comment"> * &#39;i&#39; means it&#39;s an id column.</span>
<a name="l00087"></a>00087 <span class="comment"> * +is_cont &#39;t&#39; means the feature is continuous.</span>
<a name="l00088"></a>00088 <span class="comment"> * &#39;f&#39; means it&#39;s discrete.</span>
<a name="l00089"></a>00089 <span class="comment"> * +table_oid The OID of the KV table for features/class. </span>
<a name="l00090"></a>00090 <span class="comment"> * For the id column, there is no KV table.</span>
<a name="l00091"></a>00091 <span class="comment"> * +num_dist_value The number of distinct values for a feature/class column. </span>
<a name="l00092"></a>00092 <span class="comment"> *</span>
<a name="l00093"></a>00093 <span class="comment"> * The metatable for the Golf dataset looks like this:</span>
<a name="l00094"></a>00094 <span class="comment"> * testdb=# select * from golf_meta order by id;</span>
<a name="l00095"></a>00095 <span class="comment"> * id | column_name | column_type | is_cont | table_oid | num_dist_value </span>
<a name="l00096"></a>00096 <span class="comment"> * ----+-------------+-------------+---------+-----------+----------------</span>
<a name="l00097"></a>00097 <span class="comment"> * 0 | class | c | f | 787672 | 2</span>
<a name="l00098"></a>00098 <span class="comment"> * 1 | humidity | f | t | 787749 | 9</span>
<a name="l00099"></a>00099 <span class="comment"> * 2 | outlook | f | f | 787749 | 3</span>
<a name="l00100"></a>00100 <span class="comment"> * 3 | temperature | f | t | 787749 | 12</span>
<a name="l00101"></a>00101 <span class="comment"> * 4 | windy | f | f | 787749 | 2</span>
<a name="l00102"></a>00102 <span class="comment"> * 5 | id | i | f | | </span>
<a name="l00103"></a>00103 <span class="comment"> * (6 rows)</span>
<a name="l00104"></a>00104 <span class="comment"> * </span>
<a name="l00105"></a>00105 <span class="comment"> * The KV table for features contains a record for each distinct value. The record</span>
<a name="l00106"></a>00106 <span class="comment"> * structure is:</span>
<a name="l00107"></a>00107 <span class="comment"> * +fid The ID assigned to a feature.</span>
<a name="l00108"></a>00108 <span class="comment"> * +fval For a discrete feature, it&#39;s the distinct value. </span>
<a name="l00109"></a>00109 <span class="comment"> * For a continuous feature, it&#39;s NULL.</span>
<a name="l00110"></a>00110 <span class="comment"> * +code For a discrete feature, it&#39;s the assigned key. </span>
<a name="l00111"></a>00111 <span class="comment"> * For a continuous feature, it&#39;s the average value.</span>
<a name="l00112"></a>00112 <span class="comment"> *</span>
<a name="l00113"></a>00113 <span class="comment"> * testdb=# select * from golf_kv_features order by fid, code;</span>
<a name="l00114"></a>00114 <span class="comment"> * fid | fval | code </span>
<a name="l00115"></a>00115 <span class="comment"> * -----+----------+------------------</span>
<a name="l00116"></a>00116 <span class="comment"> * 1 | | 80.2857142857143</span>
<a name="l00117"></a>00117 <span class="comment"> * 2 | overcast | 1</span>
<a name="l00118"></a>00118 <span class="comment"> * 2 | rain | 2</span>
<a name="l00119"></a>00119 <span class="comment"> * 2 | sunny | 3</span>
<a name="l00120"></a>00120 <span class="comment"> * 3 | | 73.5714285714286</span>
<a name="l00121"></a>00121 <span class="comment"> * 4 | false | 1</span>
<a name="l00122"></a>00122 <span class="comment"> * 4 | true | 2</span>
<a name="l00123"></a>00123 <span class="comment"> * (7 rows)</span>
<a name="l00124"></a>00124 <span class="comment"> *</span>
<a name="l00125"></a>00125 <span class="comment"> * The KV table for class labels contains a record for each label. The record</span>
<a name="l00126"></a>00126 <span class="comment"> * structure is the same as the KV table for features.</span>
<a name="l00127"></a>00127 <span class="comment"> * testdb=# select * from golf_kv_class order by fid, code;</span>
<a name="l00128"></a>00128 <span class="comment"> * fid | fval | code </span>
<a name="l00129"></a>00129 <span class="comment"> * -----+--------------+------</span>
<a name="l00130"></a>00130 <span class="comment"> * 0 | Do not Play | 1</span>
<a name="l00131"></a>00131 <span class="comment"> * 0 | Play | 2</span>
<a name="l00132"></a>00132 <span class="comment"> * </span>
<a name="l00133"></a>00133 <span class="comment"> * The encoded table has a record for each cell in the training table. The record</span>
<a name="l00134"></a>00134 <span class="comment"> * structure is:</span>
<a name="l00135"></a>00135 <span class="comment"> * +id The ID from the training table. </span>
<a name="l00136"></a>00136 <span class="comment"> * +fid The ID assigned to a feature</span>
<a name="l00137"></a>00137 <span class="comment"> * +fval For a discrete feature, it&#39;s the key. </span>
<a name="l00138"></a>00138 <span class="comment"> * For a continuous feature, it&#39;s the original feature value.</span>
<a name="l00139"></a>00139 <span class="comment"> * +is_cont &#39;t&#39; if the feature is continuous, or &#39;f&#39; for the discrete one.</span>
<a name="l00140"></a>00140 <span class="comment"> * +class The encoded value of the class label.</span>
<a name="l00141"></a>00141 <span class="comment"> * </span>
<a name="l00142"></a>00142 <span class="comment"> * For Golf dataset, the vertical encoded table looks like this:</span>
<a name="l00143"></a>00143 <span class="comment"> * testdb=# select * from golf_ed order by fid, id;</span>
<a name="l00144"></a>00144 <span class="comment"> * id | fid | fval | is_cont | class </span>
<a name="l00145"></a>00145 <span class="comment"> * ----+-----+------+---------+-------</span>
<a name="l00146"></a>00146 <span class="comment"> * 1 | 1 | 85 | t | 1</span>
<a name="l00147"></a>00147 <span class="comment"> * 2 | 1 | 90 | t | 1</span>
<a name="l00148"></a>00148 <span class="comment"> * 3 | 1 | 78 | t | 2</span>
<a name="l00149"></a>00149 <span class="comment"> * 4 | 1 | 96 | t | 2</span>
<a name="l00150"></a>00150 <span class="comment"> * 5 | 1 | 80 | t | 2</span>
<a name="l00151"></a>00151 <span class="comment"> * 6 | 1 | 70 | t | 1</span>
<a name="l00152"></a>00152 <span class="comment"> * 7 | 1 | 65 | t | 2</span>
<a name="l00153"></a>00153 <span class="comment"> * 8 | 1 | 95 | t | 1</span>
<a name="l00154"></a>00154 <span class="comment"> * 9 | 1 | 70 | t | 2</span>
<a name="l00155"></a>00155 <span class="comment"> * 10 | 1 | 80 | t | 2</span>
<a name="l00156"></a>00156 <span class="comment"> * 11 | 1 | 70 | t | 2</span>
<a name="l00157"></a>00157 <span class="comment"> * 12 | 1 | 90 | t | 2</span>
<a name="l00158"></a>00158 <span class="comment"> * 13 | 1 | 75 | t | 2</span>
<a name="l00159"></a>00159 <span class="comment"> * 14 | 1 | 80 | t | 1</span>
<a name="l00160"></a>00160 <span class="comment"> * 1 | 2 | 3 | f | 1</span>
<a name="l00161"></a>00161 <span class="comment"> * 2 | 2 | 3 | f | 1</span>
<a name="l00162"></a>00162 <span class="comment"> * 3 | 2 | 1 | f | 2</span>
<a name="l00163"></a>00163 <span class="comment"> * 4 | 2 | 2 | f | 2</span>
<a name="l00164"></a>00164 <span class="comment"> * 5 | 2 | 2 | f | 2</span>
<a name="l00165"></a>00165 <span class="comment"> * 6 | 2 | 2 | f | 1</span>
<a name="l00166"></a>00166 <span class="comment"> * 7 | 2 | 1 | f | 2</span>
<a name="l00167"></a>00167 <span class="comment"> * 8 | 2 | 3 | f | 1</span>
<a name="l00168"></a>00168 <span class="comment"> * 9 | 2 | 3 | f | 2</span>
<a name="l00169"></a>00169 <span class="comment"> * 10 | 2 | 2 | f | 2</span>
<a name="l00170"></a>00170 <span class="comment"> * 11 | 2 | 3 | f | 2</span>
<a name="l00171"></a>00171 <span class="comment"> * 12 | 2 | 1 | f | 2</span>
<a name="l00172"></a>00172 <span class="comment"> * 13 | 2 | 1 | f | 2</span>
<a name="l00173"></a>00173 <span class="comment"> * 14 | 2 | 2 | f | 1</span>
<a name="l00174"></a>00174 <span class="comment"> * 1 | 3 | 85 | t | 1</span>
<a name="l00175"></a>00175 <span class="comment"> * 2 | 3 | 80 | t | 1</span>
<a name="l00176"></a>00176 <span class="comment"> * 3 | 3 | 83 | t | 2</span>
<a name="l00177"></a>00177 <span class="comment"> * 4 | 3 | 70 | t | 2</span>
<a name="l00178"></a>00178 <span class="comment"> * 5 | 3 | 68 | t | 2</span>
<a name="l00179"></a>00179 <span class="comment"> * 6 | 3 | 65 | t | 1</span>
<a name="l00180"></a>00180 <span class="comment"> * 7 | 3 | 64 | t | 2</span>
<a name="l00181"></a>00181 <span class="comment"> * 8 | 3 | 72 | t | 1</span>
<a name="l00182"></a>00182 <span class="comment"> * 9 | 3 | 69 | t | 2</span>
<a name="l00183"></a>00183 <span class="comment"> * 10 | 3 | 75 | t | 2</span>
<a name="l00184"></a>00184 <span class="comment"> * 11 | 3 | 75 | t | 2</span>
<a name="l00185"></a>00185 <span class="comment"> * 12 | 3 | 72 | t | 2</span>
<a name="l00186"></a>00186 <span class="comment"> * 13 | 3 | 81 | t | 2</span>
<a name="l00187"></a>00187 <span class="comment"> * 14 | 3 | 71 | t | 1</span>
<a name="l00188"></a>00188 <span class="comment"> * 1 | 4 | 1 | f | 1</span>
<a name="l00189"></a>00189 <span class="comment"> * 2 | 4 | 2 | f | 1</span>
<a name="l00190"></a>00190 <span class="comment"> * 3 | 4 | 1 | f | 2</span>
<a name="l00191"></a>00191 <span class="comment"> * 4 | 4 | 1 | f | 2</span>
<a name="l00192"></a>00192 <span class="comment"> * 5 | 4 | 1 | f | 2</span>
<a name="l00193"></a>00193 <span class="comment"> * 6 | 4 | 2 | f | 1</span>
<a name="l00194"></a>00194 <span class="comment"> * 7 | 4 | 2 | f | 2</span>
<a name="l00195"></a>00195 <span class="comment"> * 8 | 4 | 1 | f | 1</span>
<a name="l00196"></a>00196 <span class="comment"> * 9 | 4 | 1 | f | 2</span>
<a name="l00197"></a>00197 <span class="comment"> * 10 | 4 | 1 | f | 2</span>
<a name="l00198"></a>00198 <span class="comment"> * 11 | 4 | 2 | f | 2</span>
<a name="l00199"></a>00199 <span class="comment"> * 12 | 4 | 2 | f | 2</span>
<a name="l00200"></a>00200 <span class="comment"> * 13 | 4 | 1 | f | 2</span>
<a name="l00201"></a>00201 <span class="comment"> * 14 | 4 | 2 | f | 1</span>
<a name="l00202"></a>00202 <span class="comment"> * (56 rows)</span>
<a name="l00203"></a>00203 <span class="comment"> *</span>
<a name="l00204"></a>00204 <span class="comment"> * On databases that support compression, we can leverage that feature</span>
<a name="l00205"></a>00205 <span class="comment"> * to reduce the space required for keeping the encoded table.</span>
<a name="l00206"></a>00206 <span class="comment"> *</span>
<a name="l00207"></a>00207 <span class="comment"> * For classification, we will use the metatable and KV tables to encode </span>
<a name="l00208"></a>00208 <span class="comment"> * the table (horizontal table) to be classified into some like this: </span>
<a name="l00209"></a>00209 <span class="comment"> *</span>
<a name="l00210"></a>00210 <span class="comment"> * testdb# select * from golf_ed order by id;</span>
<a name="l00211"></a>00211 <span class="comment"> * id | fvals | class </span>
<a name="l00212"></a>00212 <span class="comment"> * ----+-------------+-------</span>
<a name="l00213"></a>00213 <span class="comment"> * 1 | {85,3,85,1} | 1</span>
<a name="l00214"></a>00214 <span class="comment"> * 2 | {90,3,80,2} | 1</span>
<a name="l00215"></a>00215 <span class="comment"> * 3 | {78,1,83,1} | 2</span>
<a name="l00216"></a>00216 <span class="comment"> * 4 | {96,2,70,1} | 2</span>
<a name="l00217"></a>00217 <span class="comment"> * 5 | {80,2,68,1} | 2</span>
<a name="l00218"></a>00218 <span class="comment"> * 6 | {70,2,65,2} | 1</span>
<a name="l00219"></a>00219 <span class="comment"> * 7 | {65,1,64,2} | 2</span>
<a name="l00220"></a>00220 <span class="comment"> * 8 | {95,3,72,1} | 1</span>
<a name="l00221"></a>00221 <span class="comment"> * 9 | {70,3,69,1} | 2</span>
<a name="l00222"></a>00222 <span class="comment"> * 10 | {80,2,75,1} | 2</span>
<a name="l00223"></a>00223 <span class="comment"> * 11 | {70,3,75,2} | 2</span>
<a name="l00224"></a>00224 <span class="comment"> * 12 | {90,1,72,2} | 2</span>
<a name="l00225"></a>00225 <span class="comment"> * 13 | {75,1,81,1} | 2</span>
<a name="l00226"></a>00226 <span class="comment"> * 14 | {80,2,71,2} | 1</span>
<a name="l00227"></a>00227 <span class="comment"> * (14 rows)</span>
<a name="l00228"></a>00228 <span class="comment"> *</span>
<a name="l00229"></a>00229 <span class="comment"> * In general, each record in the new encoded table has the following structure:</span>
<a name="l00230"></a>00230 <span class="comment"> * +id The ID from the classification table.</span>
<a name="l00231"></a>00231 <span class="comment"> * +fvals An array contains all the features&#39; values for a given ID.</span>
<a name="l00232"></a>00232 <span class="comment"> * For a discrete feature, the element in the array is the key.</span>
<a name="l00233"></a>00233 <span class="comment"> * For a continuous feature, it&#39;s the original value.</span>
<a name="l00234"></a>00234 <span class="comment"> * +class The encoded value of a class label.</span>
<a name="l00235"></a>00235 <span class="comment"> *</span>
<a name="l00236"></a>00236 <span class="comment"> */</span>
<a name="l00237"></a>00237
<a name="l00238"></a>00238
<a name="l00239"></a>00239 <span class="comment">/*</span>
<a name="l00240"></a>00240 <span class="comment"> * The UDT for keeping the time for each step of the encoding procedure.</span>
<a name="l00241"></a>00241 <span class="comment"> *</span>
<a name="l00242"></a>00242 <span class="comment"> * pre_proc_time The time of pre-processing.</span>
<a name="l00243"></a>00243 <span class="comment"> * breakup_tbl_time The time of breaking up the training table.</span>
<a name="l00244"></a>00244 <span class="comment"> * gen_kv_time The time of generating KV-table for </span>
<a name="l00245"></a>00245 <span class="comment"> * features/class. </span>
<a name="l00246"></a>00246 <span class="comment"> * gen_enc_time The time of generating encoded table.</span>
<a name="l00247"></a>00247 <span class="comment"> * post_proc_time The time of post-processing.</span>
<a name="l00248"></a>00248 <span class="comment"> *</span>
<a name="l00249"></a>00249 <span class="comment"> */</span>
<a name="l00250"></a>00250 DROP TYPE IF EXISTS MADLIB_SCHEMA.__enc_tbl_result;
<a name="l00251"></a>00251 CREATE TYPE MADLIB_SCHEMA.__enc_tbl_result AS
<a name="l00252"></a>00252 (
<a name="l00253"></a>00253 pre_proc_time INTERVAL,
<a name="l00254"></a>00254 breakup_tbl_time INTERVAL,
<a name="l00255"></a>00255 gen_kv_time INTERVAL,
<a name="l00256"></a>00256 gen_enc_time INTERVAL,
<a name="l00257"></a>00257 post_proc_time INTERVAL
<a name="l00258"></a>00258 );
<a name="l00259"></a>00259
<a name="l00260"></a>00260
<a name="l00261"></a>00261 <span class="comment">/*</span>
<a name="l00262"></a>00262 <span class="comment"> * @brief Check if the input table has unsupported data type or not.</span>
<a name="l00263"></a>00263 <span class="comment"> * Check if the id column of input table has duplicated value or not.</span>
<a name="l00264"></a>00264 <span class="comment"> *</span>
<a name="l00265"></a>00265 <span class="comment"> * @param full_table_name The full table name.</span>
<a name="l00266"></a>00266 <span class="comment"> * @param feature_columns The array including all feature names.</span>
<a name="l00267"></a>00267 <span class="comment"> * @param id_column The name of the ID column. </span>
<a name="l00268"></a>00268 <span class="comment"> * @param class_column The name of the class column. </span>
<a name="l00269"></a>00269 <span class="comment"> *</span>
<a name="l00270"></a>00270 <span class="comment"> * @return If the table has unsupported data types, then raise exception</span>
<a name="l00271"></a>00271 <span class="comment"> * otherwise return nothing.</span>
<a name="l00272"></a>00272 <span class="comment"> *</span>
<a name="l00273"></a>00273 <span class="comment"> */</span>
<a name="l00274"></a>00274 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table
<a name="l00275"></a>00275 (
<a name="l00276"></a>00276 full_table_name TEXT,
<a name="l00277"></a>00277 feature_columns TEXT[],
<a name="l00278"></a>00278 id_column TEXT,
<a name="l00279"></a>00279 class_column TEXT
<a name="l00280"></a>00280 )
<a name="l00281"></a>00281 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00282"></a>00282 DECLARE
<a name="l00283"></a>00283 rec RECORD;
<a name="l00284"></a>00284 stmt TEXT;
<a name="l00285"></a>00285 all_columns TEXT := &#39;&#39;;
<a name="l00286"></a>00286 index INT;
<a name="l00287"></a>00287 BEGIN
<a name="l00288"></a>00288 -- find the first (LIMIT 1) unsupported data type if the input table has.
<a name="l00289"></a>00289 stmt= &#39;SELECT atttypid
<a name="l00290"></a>00290 FROM pg_attribute
<a name="l00291"></a>00291 WHERE attrelid =&#39;||quote_literal(full_table_name)||&#39;::regclass AND
<a name="l00292"></a>00292 attnum &gt; 0 AND
<a name="l00293"></a>00293 (not attisdropped) AND
<a name="l00294"></a>00294 atttypid NOT IN
<a name="l00295"></a>00295 (
<a name="l00296"></a>00296 SELECT unnest
<a name="l00297"></a>00297 (
<a name="l00298"></a>00298 ARRAY[
<a name="l00299"></a>00299 &#39;&#39;SMALLINT&#39;&#39;::regtype::oid,
<a name="l00300"></a>00300 &#39;&#39;INT&#39;&#39;::regtype::oid,
<a name="l00301"></a>00301 &#39;&#39;BIGINT&#39;&#39;::regtype::oid,
<a name="l00302"></a>00302 &#39;&#39;FLOAT8&#39;&#39;::regtype::oid,
<a name="l00303"></a>00303 &#39;&#39;REAL&#39;&#39;::regtype::oid,
<a name="l00304"></a>00304 &#39;&#39;DECIMAL&#39;&#39;::regtype::oid,
<a name="l00305"></a>00305 &#39;&#39;INET&#39;&#39;::regtype::oid,
<a name="l00306"></a>00306 &#39;&#39;CIDR&#39;&#39;::regtype::oid,
<a name="l00307"></a>00307 &#39;&#39;MACADDR&#39;&#39;::regtype::oid,
<a name="l00308"></a>00308 &#39;&#39;BOOLEAN&#39;&#39;::regtype::oid,
<a name="l00309"></a>00309 &#39;&#39;CHAR&#39;&#39;::regtype::oid,
<a name="l00310"></a>00310 &#39;&#39;VARCHAR&#39;&#39;::regtype::oid,
<a name="l00311"></a>00311 &#39;&#39;TEXT&#39;&#39;::regtype::oid,
<a name="l00312"></a>00312 &#39;&#39;&quot;<span class="keywordtype">char</span>&quot;&#39;&#39;::regtype::oid,
<a name="l00313"></a>00313 &#39;&#39;DATE&#39;&#39;::regtype::oid,
<a name="l00314"></a>00314 &#39;&#39;TIME&#39;&#39;::regtype::oid,
<a name="l00315"></a>00315 &#39;&#39;TIMETZ&#39;&#39;::regtype::oid,
<a name="l00316"></a>00316 &#39;&#39;TIMESTAMP&#39;&#39;::regtype::oid,
<a name="l00317"></a>00317 &#39;&#39;TIMESTAMPTZ&#39;&#39;::regtype::oid,
<a name="l00318"></a>00318 &#39;&#39;INTERVAL&#39;&#39;::regtype::oid
<a name="l00319"></a>00319 ]
<a name="l00320"></a>00320 )
<a name="l00321"></a>00321 ) &#39;;
<a name="l00322"></a>00322
<a name="l00323"></a>00323 IF (feature_columns IS NOT NULL) THEN
<a name="l00324"></a>00324 -- If user do not specify feature columns, we use all those columns.
<a name="l00325"></a>00325 -- Otherwise, we just need to check those specified columns.
<a name="l00326"></a>00326 index = array_lower(feature_columns, 1);
<a name="l00327"></a>00327 WHILE (index &lt;= array_upper(feature_columns, 1)) LOOP
<a name="l00328"></a>00328 all_columns = all_columns ||
<a name="l00329"></a>00329 quote_literal(feature_columns[index]) ||
<a name="l00330"></a>00330 &#39;,&#39;;
<a name="l00331"></a>00331 index = index+1;
<a name="l00332"></a>00332 END LOOP;
<a name="l00333"></a>00333
<a name="l00334"></a>00334 all_columns = all_columns || quote_literal(id_column) || &#39;,&#39;;
<a name="l00335"></a>00335 all_columns = all_columns || quote_literal(class_column);
<a name="l00336"></a>00336 stmt = stmt ||&#39; AND attname IN (&#39;||all_columns||&#39;) &#39;;
<a name="l00337"></a>00337 END IF;
<a name="l00338"></a>00338
<a name="l00339"></a>00339 stmt = stmt||&#39; LIMIT 1;&#39;;
<a name="l00340"></a>00340
<a name="l00341"></a>00341 EXECUTE stmt INTO rec;
<a name="l00342"></a>00342
<a name="l00343"></a>00343 IF (rec IS NOT NULL) THEN
<a name="l00344"></a>00344 -- Print the first unsupported data type, and supported types.
<a name="l00345"></a>00345 RAISE EXCEPTION &#39;Unsupported data type [%]. Supported types include:
<a name="l00346"></a>00346 SMALLINT, INT, BIGINT, FLOAT8, REAL,
<a name="l00347"></a>00347 DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
<a name="l00348"></a>00348 CHAR, VARCHAR, TEXT, &quot;<span class="keywordtype">char</span>&quot;,
<a name="l00349"></a>00349 DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL&#39;,
<a name="l00350"></a>00350 rec.atttypid::regtype;
<a name="l00351"></a>00351 END IF;
<a name="l00352"></a>00352
<a name="l00353"></a>00353 SELECT MADLIB_SCHEMA.__format
<a name="l00354"></a>00354 (&#39;SELECT % AS n
<a name="l00355"></a>00355 FROM %
<a name="l00356"></a>00356 GROUP BY %
<a name="l00357"></a>00357 HAVING COUNT(%) &gt; 1
<a name="l00358"></a>00358 LIMIT 1&#39;,
<a name="l00359"></a>00359 ARRAY[
<a name="l00360"></a>00360 id_column,
<a name="l00361"></a>00361 full_table_name,
<a name="l00362"></a>00362 id_column,
<a name="l00363"></a>00363 id_column
<a name="l00364"></a>00364 ]
<a name="l00365"></a>00365 )
<a name="l00366"></a>00366 INTO stmt;
<a name="l00367"></a>00367
<a name="l00368"></a>00368 EXECUTE stmt INTO rec;
<a name="l00369"></a>00369
<a name="l00370"></a>00370 -- check if the <span class="keywordtype">id</span> column has duplicated value
<a name="l00371"></a>00371 PERFORM MADLIB_SCHEMA.__assert
<a name="l00372"></a>00372 (
<a name="l00373"></a>00373 rec IS NULL,
<a name="l00374"></a>00374 &#39;The training table &#39; || full_table_name || &#39; must not have duplicated <span class="keywordtype">id</span>&#39;
<a name="l00375"></a>00375 );
<a name="l00376"></a>00376
<a name="l00377"></a>00377 RETURN;
<a name="l00378"></a>00378 END
<a name="l00379"></a>00379 $$ LANGUAGE PLPGSQL;
<a name="l00380"></a>00380
<a name="l00381"></a>00381
<a name="l00382"></a>00382 <span class="comment">/*</span>
<a name="l00383"></a>00383 <span class="comment"> * @brief Get the class table name by the metatable name.</span>
<a name="l00384"></a>00384 <span class="comment"> *</span>
<a name="l00385"></a>00385 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00386"></a>00386 <span class="comment"> *</span>
<a name="l00387"></a>00387 <span class="comment"> * @return The name of the class table</span>
<a name="l00388"></a>00388 <span class="comment"> *</span>
<a name="l00389"></a>00389 <span class="comment"> */</span>
<a name="l00390"></a>00390 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name
<a name="l00391"></a>00391 (
<a name="l00392"></a>00392 meta_tbl_name TEXT
<a name="l00393"></a>00393 )
<a name="l00394"></a>00394 RETURNS TEXT AS $$
<a name="l00395"></a>00395 DECLARE
<a name="l00396"></a>00396 classtable_name TEXT;
<a name="l00397"></a>00397 curstmt TEXT;
<a name="l00398"></a>00398 BEGIN
<a name="l00399"></a>00399
<a name="l00400"></a>00400 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00401"></a>00401 (
<a name="l00402"></a>00402 meta_tbl_name,
<a name="l00403"></a>00403 &#39;t&#39;
<a name="l00404"></a>00404 );
<a name="l00405"></a>00405
<a name="l00406"></a>00406 curstmt = MADLIB_SCHEMA.__format
<a name="l00407"></a>00407 (
<a name="l00408"></a>00408 &#39;SELECT MADLIB_SCHEMA.__regclass_to_text
<a name="l00409"></a>00409 (table_oid) as table_name
<a name="l00410"></a>00410 FROM %
<a name="l00411"></a>00411 WHERE column_type = &#39;&#39;c&#39;&#39;&#39;,
<a name="l00412"></a>00412 ARRAY[
<a name="l00413"></a>00413 meta_tbl_name
<a name="l00414"></a>00414 ]
<a name="l00415"></a>00415 );
<a name="l00416"></a>00416
<a name="l00417"></a>00417 EXECUTE curstmt INTO classtable_name;
<a name="l00418"></a>00418
<a name="l00419"></a>00419 RETURN classtable_name;
<a name="l00420"></a>00420 END
<a name="l00421"></a>00421 $$ LANGUAGE PLPGSQL;
<a name="l00422"></a>00422
<a name="l00423"></a>00423
<a name="l00424"></a>00424 <span class="comment">/*</span>
<a name="l00425"></a>00425 <span class="comment"> * @brief Drop the metatable and KV tables </span>
<a name="l00426"></a>00426 <span class="comment"> * for the features and the class.</span>
<a name="l00427"></a>00427 <span class="comment"> *</span>
<a name="l00428"></a>00428 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00429"></a>00429 <span class="comment"> *</span>
<a name="l00430"></a>00430 <span class="comment"> */</span>
<a name="l00431"></a>00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable
<a name="l00432"></a>00432 (
<a name="l00433"></a>00433 meta_tbl_name TEXT
<a name="l00434"></a>00434 )
<a name="l00435"></a>00435 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00436"></a>00436 DECLARE
<a name="l00437"></a>00437 curstmt TEXT;
<a name="l00438"></a>00438 name TEXT;
<a name="l00439"></a>00439 BEGIN
<a name="l00440"></a>00440 IF (meta_tbl_name is NULL ) THEN
<a name="l00441"></a>00441 RETURN;
<a name="l00442"></a>00442 END IF;
<a name="l00443"></a>00443
<a name="l00444"></a>00444 PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, &#39;t&#39;);
<a name="l00445"></a>00445
<a name="l00446"></a>00446 -- get the Key-Value tables
<a name="l00447"></a>00447 curstmt = MADLIB_SCHEMA.__format
<a name="l00448"></a>00448 (
<a name="l00449"></a>00449 &#39;SELECT MADLIB_SCHEMA.__regclass_to_text
<a name="l00450"></a>00450 (table_oid) as table_name
<a name="l00451"></a>00451 FROM
<a name="l00452"></a>00452 (
<a name="l00453"></a>00453 SELECT table_oid
<a name="l00454"></a>00454 FROM %
<a name="l00455"></a>00455 WHERE table_oid IS NOT NULL
<a name="l00456"></a>00456 GROUP BY table_oid
<a name="l00457"></a>00457 ) t&#39;,
<a name="l00458"></a>00458 ARRAY[
<a name="l00459"></a>00459 meta_tbl_name
<a name="l00460"></a>00460 ]
<a name="l00461"></a>00461 );
<a name="l00462"></a>00462
<a name="l00463"></a>00463 -- drop all the Key-Value tables
<a name="l00464"></a>00464 FOR name IN EXECUTE curstmt LOOP
<a name="l00465"></a>00465 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || name || &#39; CASCADE;&#39;;
<a name="l00466"></a>00466 END LOOP;
<a name="l00467"></a>00467
<a name="l00468"></a>00468 -- drop the metatable
<a name="l00469"></a>00469 EXECUTE &#39;DROP TABLE &#39; || meta_tbl_name || &#39; CASCADE;&#39;;
<a name="l00470"></a>00470 END
<a name="l00471"></a>00471 $$ LANGUAGE PLPGSQL;
<a name="l00472"></a>00472
<a name="l00473"></a>00473
<a name="l00474"></a>00474 <span class="comment">/*</span>
<a name="l00475"></a>00475 <span class="comment"> * @brief Create the metatable.</span>
<a name="l00476"></a>00476 <span class="comment"> </span>
<a name="l00477"></a>00477 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00478"></a>00478 <span class="comment"> *</span>
<a name="l00479"></a>00479 <span class="comment"> */</span>
<a name="l00480"></a>00480 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable
<a name="l00481"></a>00481 (
<a name="l00482"></a>00482 meta_tbl_name TEXT
<a name="l00483"></a>00483 )
<a name="l00484"></a>00484 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00485"></a>00485 DECLARE
<a name="l00486"></a>00486 curstmt TEXT;
<a name="l00487"></a>00487 result INT := 0;
<a name="l00488"></a>00488 BEGIN
<a name="l00489"></a>00489 -- the maximum length of an identifier is 63
<a name="l00490"></a>00490 PERFORM MADLIB_SCHEMA.__assert
<a name="l00491"></a>00491 (
<a name="l00492"></a>00492 length(MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)) &lt;= 63,
<a name="l00493"></a>00493 &#39;The maximum length of &#39; ||
<a name="l00494"></a>00494 MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name) ||
<a name="l00495"></a>00495 &#39; is 63&#39;
<a name="l00496"></a>00496 );
<a name="l00497"></a>00497
<a name="l00498"></a>00498 -- must not be existence
<a name="l00499"></a>00499 PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, &#39;f&#39;);
<a name="l00500"></a>00500
<a name="l00501"></a>00501 -- &#39;f&#39; for feature, &#39;c&#39; for class, &#39;i&#39; for <span class="keywordtype">id</span>
<a name="l00502"></a>00502 -- &#39;t&#39; for continuous value, &#39;f&#39; for discrete value
<a name="l00503"></a>00503 curstmt = MADLIB_SCHEMA.__format
<a name="l00504"></a>00504 (
<a name="l00505"></a>00505 &#39;CREATE TABLE %(
<a name="l00506"></a>00506 <span class="keywordtype">id</span> INT,
<a name="l00507"></a>00507 column_name TEXT,
<a name="l00508"></a>00508 column_type TEXT,
<a name="l00509"></a>00509 is_cont BOOL,
<a name="l00510"></a>00510 table_oid OID,
<a name="l00511"></a>00511 num_dist_value INT
<a name="l00512"></a>00512 ) m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (<span class="keywordtype">id</span>)&#39;)&#39;,
<a name="l00513"></a>00513 meta_tbl_name
<a name="l00514"></a>00514 );
<a name="l00515"></a>00515 EXECUTE curstmt;
<a name="l00516"></a>00516 END
<a name="l00517"></a>00517 $$ LANGUAGE PLPGSQL;
<a name="l00518"></a>00518
<a name="l00519"></a>00519
<a name="l00520"></a>00520 <span class="comment">/*</span>
<a name="l00521"></a>00521 <span class="comment"> * @brief Insert a record to the metatable</span>
<a name="l00522"></a>00522 <span class="comment"> * A row in the metatable represents a column&#39;s information.</span>
<a name="l00523"></a>00523 <span class="comment"> *</span>
<a name="l00524"></a>00524 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00525"></a>00525 <span class="comment"> * @param column_name The name of the column.</span>
<a name="l00526"></a>00526 <span class="comment"> * @param column_type The type of the column.</span>
<a name="l00527"></a>00527 <span class="comment"> * &#39;i&#39; means id, &#39;c&#39; means class, &#39;f&#39; means feature.</span>
<a name="l00528"></a>00528 <span class="comment"> * @param is_cont True if the column is continuous.</span>
<a name="l00529"></a>00529 <span class="comment"> * @param table_name The full name of key-value table for the column.</span>
<a name="l00530"></a>00530 <span class="comment"> * The OID of this table will be stored.</span>
<a name="l00531"></a>00531 <span class="comment"> * @param num_dist_value The number of distinct values for the column.</span>
<a name="l00532"></a>00532 <span class="comment"> *</span>
<a name="l00533"></a>00533 <span class="comment"> * @note The null value will be included in the distinct values.</span>
<a name="l00534"></a>00534 <span class="comment"> *</span>
<a name="l00535"></a>00535 <span class="comment"> */</span>
<a name="l00536"></a>00536 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable
<a name="l00537"></a>00537 (
<a name="l00538"></a>00538 meta_tbl_name TEXT,
<a name="l00539"></a>00539 col_index INT,
<a name="l00540"></a>00540 column_name TEXT,
<a name="l00541"></a>00541 column_type CHAR,
<a name="l00542"></a>00542 is_cont BOOLEAN,
<a name="l00543"></a>00543 table_name TEXT,
<a name="l00544"></a>00544 num_dist_value INT
<a name="l00545"></a>00545 )
<a name="l00546"></a>00546 RETURNS <span class="keywordtype">void</span> AS $$
<a name="l00547"></a>00547 DECLARE
<a name="l00548"></a>00548 curstmt TEXT := &#39;&#39;;
<a name="l00549"></a>00549 tbl_txt TEXT := &#39;NULL&#39;;
<a name="l00550"></a>00550 BEGIN
<a name="l00551"></a>00551 PERFORM MADLIB_SCHEMA.__assert
<a name="l00552"></a>00552 (
<a name="l00553"></a>00553 column_type = &#39;f&#39; OR column_type = &#39;i&#39; OR column_type = &#39;c&#39;,
<a name="l00554"></a>00554 &#39;column type must be &#39;&#39;f&#39;&#39;, &#39;&#39;i&#39;&#39; or &#39;&#39;c&#39;&#39;&#39;
<a name="l00555"></a>00555 );
<a name="l00556"></a>00556 IF (table_name IS NOT NULL) THEN
<a name="l00557"></a>00557 tbl_txt = &#39;&#39;&#39;&#39; || table_name || &#39;&#39;&#39;&#39;;
<a name="l00558"></a>00558 END IF;
<a name="l00559"></a>00559
<a name="l00560"></a>00560 curstmt = MADLIB_SCHEMA.__format
<a name="l00561"></a>00561 (
<a name="l00562"></a>00562 &#39;INSERT INTO % VALUES
<a name="l00563"></a>00563 (%, &#39;&#39;%&#39;&#39;, &#39;&#39;%&#39;&#39;, &#39;&#39;%&#39;&#39;, %::regclass, %);&#39;,
<a name="l00564"></a>00564 ARRAY[
<a name="l00565"></a>00565 meta_tbl_name,
<a name="l00566"></a>00566 col_index::TEXT,
<a name="l00567"></a>00567 column_name,
<a name="l00568"></a>00568 column_type,
<a name="l00569"></a>00569 MADLIB_SCHEMA.__to_char(is_cont),
<a name="l00570"></a>00570 tbl_txt,
<a name="l00571"></a>00571 num_dist_value::TEXT
<a name="l00572"></a>00572 ]
<a name="l00573"></a>00573 );
<a name="l00574"></a>00574
<a name="l00575"></a>00575 EXECUTE curstmt;
<a name="l00576"></a>00576 END
<a name="l00577"></a>00577 $$ LANGUAGE PLPGSQL;
<a name="l00578"></a>00578
<a name="l00579"></a>00579
<a name="l00580"></a>00580 <span class="comment">/*</span>
<a name="l00581"></a>00581 <span class="comment"> * @brief Validate if the metatable exists or not.</span>
<a name="l00582"></a>00582 <span class="comment"> * Validate if the tables in &quot;table_oid&quot; column exists or not.</span>
<a name="l00583"></a>00583 <span class="comment"> *</span>
<a name="l00584"></a>00584 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00585"></a>00585 <span class="comment"> *</span>
<a name="l00586"></a>00586 <span class="comment"> */</span>
<a name="l00587"></a>00587 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_metatable
<a name="l00588"></a>00588 (
<a name="l00589"></a>00589 meta_tbl_name TEXT
<a name="l00590"></a>00590 )
<a name="l00591"></a>00591 RETURNS VOID AS $$
<a name="l00592"></a>00592 DECLARE
<a name="l00593"></a>00593 curstmt TEXT;
<a name="l00594"></a>00594 name TEXT;
<a name="l00595"></a>00595 BEGIN
<a name="l00596"></a>00596 PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, &#39;t&#39;);
<a name="l00597"></a>00597
<a name="l00598"></a>00598 -- if one of those KV tables doesn&#39;t exist,
<a name="l00599"></a>00599 -- we raise exception.
<a name="l00600"></a>00600 curstmt = MADLIB_SCHEMA.__format
<a name="l00601"></a>00601 (
<a name="l00602"></a>00602 &#39;SELECT MADLIB_SCHEMA.__assert_table
<a name="l00603"></a>00603 (MADLIB_SCHEMA.__regclass_to_text(table_oid), &#39;&#39;t&#39;&#39;)
<a name="l00604"></a>00604 FROM
<a name="l00605"></a>00605 (
<a name="l00606"></a>00606 SELECT table_oid
<a name="l00607"></a>00607 FROM %
<a name="l00608"></a>00608 WHERE table_oid IS NOT NULL
<a name="l00609"></a>00609 GROUP BY table_oid
<a name="l00610"></a>00610 ) t&#39;,
<a name="l00611"></a>00611 ARRAY[
<a name="l00612"></a>00612 meta_tbl_name
<a name="l00613"></a>00613 ]
<a name="l00614"></a>00614 );
<a name="l00615"></a>00615 EXECUTE curstmt;
<a name="l00616"></a>00616 END
<a name="l00617"></a>00617 $$ LANGUAGE PLPGSQL;
<a name="l00618"></a>00618
<a name="l00619"></a>00619 <span class="comment">/*</span>
<a name="l00620"></a>00620 <span class="comment"> * @brief Get the number of distinct values for the feature with given ID.</span>
<a name="l00621"></a>00621 <span class="comment"> *</span>
<a name="l00622"></a>00622 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00623"></a>00623 <span class="comment"> * @param feature_id The ID of the feature in the metatable.</span>
<a name="l00624"></a>00624 <span class="comment"> *</span>
<a name="l00625"></a>00625 <span class="comment"> * @return The number of the distinct values for the given feature.</span>
<a name="l00626"></a>00626 <span class="comment"> *</span>
<a name="l00627"></a>00627 <span class="comment"> */</span>
<a name="l00628"></a>00628 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value
<a name="l00629"></a>00629 (
<a name="l00630"></a>00630 meta_tbl_name TEXT,
<a name="l00631"></a>00631 feature_id INT
<a name="l00632"></a>00632 )
<a name="l00633"></a>00633 RETURNS INT4 AS $$
<a name="l00634"></a>00634 DECLARE
<a name="l00635"></a>00635 curstmt TEXT := &#39;&#39;;
<a name="l00636"></a>00636 result INT4 := 0;
<a name="l00637"></a>00637 BEGIN
<a name="l00638"></a>00638 curstmt = MADLIB_SCHEMA.__format
<a name="l00639"></a>00639 (
<a name="l00640"></a>00640 &#39;SELECT num_dist_value
<a name="l00641"></a>00641 FROM %
<a name="l00642"></a>00642 WHERE column_type=&#39;&#39;f&#39;&#39; AND <span class="keywordtype">id</span> = %&#39;,
<a name="l00643"></a>00643 meta_tbl_name,
<a name="l00644"></a>00644 feature_id::TEXT
<a name="l00645"></a>00645 );
<a name="l00646"></a>00646
<a name="l00647"></a>00647 EXECUTE curstmt INTO result;
<a name="l00648"></a>00648
<a name="l00649"></a>00649 RETURN result;
<a name="l00650"></a>00650 END
<a name="l00651"></a>00651 $$ LANGUAGE PLPGSQL;
<a name="l00652"></a>00652
<a name="l00653"></a>00653
<a name="l00654"></a>00654 <span class="comment">/*</span>
<a name="l00655"></a>00655 <span class="comment"> * @brief Get the number of features.</span>
<a name="l00656"></a>00656 <span class="comment"> *</span>
<a name="l00657"></a>00657 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00658"></a>00658 <span class="comment"> *</span>
<a name="l00659"></a>00659 <span class="comment"> * @return The number of features in the training table.</span>
<a name="l00660"></a>00660 <span class="comment"> *</span>
<a name="l00661"></a>00661 <span class="comment"> */</span>
<a name="l00662"></a>00662 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature
<a name="l00663"></a>00663 (
<a name="l00664"></a>00664 meta_tbl_name TEXT
<a name="l00665"></a>00665 )
<a name="l00666"></a>00666 RETURNS INT4 AS $$
<a name="l00667"></a>00667 DECLARE
<a name="l00668"></a>00668 curstmt TEXT := &#39;&#39;;
<a name="l00669"></a>00669 result INT4 := 0;
<a name="l00670"></a>00670 BEGIN
<a name="l00671"></a>00671 curstmt = MADLIB_SCHEMA.__format
<a name="l00672"></a>00672 (
<a name="l00673"></a>00673 &#39;SELECT COUNT(*)
<a name="l00674"></a>00674 FROM %
<a name="l00675"></a>00675 WHERE column_type=&#39;&#39;f&#39;&#39;&#39;,
<a name="l00676"></a>00676 meta_tbl_name
<a name="l00677"></a>00677 );
<a name="l00678"></a>00678
<a name="l00679"></a>00679 EXECUTE curstmt INTO result;
<a name="l00680"></a>00680
<a name="l00681"></a>00681 RETURN result;
<a name="l00682"></a>00682 END
<a name="l00683"></a>00683 $$ LANGUAGE PLPGSQL;
<a name="l00684"></a>00684
<a name="l00685"></a>00685
<a name="l00686"></a>00686 <span class="comment">/*</span>
<a name="l00687"></a>00687 <span class="comment"> * @brief Get the number of distinct class values.</span>
<a name="l00688"></a>00688 <span class="comment"> *</span>
<a name="l00689"></a>00689 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00690"></a>00690 <span class="comment"> *</span>
<a name="l00691"></a>00691 <span class="comment"> * @return The number of class labels in the training table.</span>
<a name="l00692"></a>00692 <span class="comment"> *</span>
<a name="l00693"></a>00693 <span class="comment"> */</span>
<a name="l00694"></a>00694 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class
<a name="l00695"></a>00695 (
<a name="l00696"></a>00696 meta_tbl_name TEXT
<a name="l00697"></a>00697 )
<a name="l00698"></a>00698 RETURNS INT4 AS $$
<a name="l00699"></a>00699 DECLARE
<a name="l00700"></a>00700 curstmt TEXT := &#39;&#39;;
<a name="l00701"></a>00701 result INT4 := 0;
<a name="l00702"></a>00702 class_table_name TEXT := &#39;&#39;;
<a name="l00703"></a>00703 BEGIN
<a name="l00704"></a>00704 curstmt = MADLIB_SCHEMA.__format
<a name="l00705"></a>00705 (
<a name="l00706"></a>00706 &#39;SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid)
<a name="l00707"></a>00707 FROM %
<a name="l00708"></a>00708 WHERE column_type=&#39;&#39;c&#39;&#39;&#39;,
<a name="l00709"></a>00709 meta_tbl_name
<a name="l00710"></a>00710 );
<a name="l00711"></a>00711
<a name="l00712"></a>00712 EXECUTE curstmt INTO class_table_name;
<a name="l00713"></a>00713
<a name="l00714"></a>00714 curstmt = MADLIB_SCHEMA.__format
<a name="l00715"></a>00715 (
<a name="l00716"></a>00716 &#39;SELECT COUNT(code)
<a name="l00717"></a>00717 FROM %&#39;,
<a name="l00718"></a>00718 class_table_name
<a name="l00719"></a>00719 );
<a name="l00720"></a>00720
<a name="l00721"></a>00721 EXECUTE curstmt INTO result;
<a name="l00722"></a>00722
<a name="l00723"></a>00723 RETURN result;
<a name="l00724"></a>00724 END
<a name="l00725"></a>00725 $$ LANGUAGE PLPGSQL;
<a name="l00726"></a>00726
<a name="l00727"></a>00727
<a name="l00728"></a>00728 <span class="comment">/*</span>
<a name="l00729"></a>00729 <span class="comment"> * @brief Get the feature name by the specified feature ID.</span>
<a name="l00730"></a>00730 <span class="comment"> *</span>
<a name="l00731"></a>00731 <span class="comment"> * @param feature_index The ID of the feature.</span>
<a name="l00732"></a>00732 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00733"></a>00733 <span class="comment"> *</span>
<a name="l00734"></a>00734 <span class="comment"> * @return The feature name.</span>
<a name="l00735"></a>00735 <span class="comment"> *</span>
<a name="l00736"></a>00736 <span class="comment"> */</span>
<a name="l00737"></a>00737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name
<a name="l00738"></a>00738 (
<a name="l00739"></a>00739 feature_index INT,
<a name="l00740"></a>00740 meta_tbl_name TEXT
<a name="l00741"></a>00741 )
<a name="l00742"></a>00742 RETURNS TEXT AS $$
<a name="l00743"></a>00743 DECLARE
<a name="l00744"></a>00744 curstmt TEXT;
<a name="l00745"></a>00745 result TEXT := &#39;&#39;;
<a name="l00746"></a>00746 BEGIN
<a name="l00747"></a>00747 curstmt = MADLIB_SCHEMA.__format
<a name="l00748"></a>00748 (
<a name="l00749"></a>00749 &#39;SELECT column_name
<a name="l00750"></a>00750 FROM %
<a name="l00751"></a>00751 WHERE <span class="keywordtype">id</span> = % AND column_type = &#39;&#39;f&#39;&#39;;&#39;,
<a name="l00752"></a>00752 meta_tbl_name,
<a name="l00753"></a>00753 MADLIB_SCHEMA.__to_char(feature_index)
<a name="l00754"></a>00754 );
<a name="l00755"></a>00755
<a name="l00756"></a>00756 EXECUTE curstmt INTO result;
<a name="l00757"></a>00757
<a name="l00758"></a>00758 RETURN result;
<a name="l00759"></a>00759 END
<a name="l00760"></a>00760 $$ LANGUAGE PLPGSQL;
<a name="l00761"></a>00761
<a name="l00762"></a>00762
<a name="l00763"></a>00763 <span class="comment">/*</span>
<a name="l00764"></a>00764 <span class="comment"> * @brief Get the column value by the specified column ID and code.</span>
<a name="l00765"></a>00765 <span class="comment"> *</span>
<a name="l00766"></a>00766 <span class="comment"> * @param column_index The ID of the column.</span>
<a name="l00767"></a>00767 <span class="comment"> * @param code The code of the column value.</span>
<a name="l00768"></a>00768 <span class="comment"> * @param column_type The type of the column.</span>
<a name="l00769"></a>00769 <span class="comment"> * &#39;i&#39; means id, &#39;c&#39; means class, &#39;f&#39; means feature.</span>
<a name="l00770"></a>00770 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00771"></a>00771 <span class="comment"> * </span>
<a name="l00772"></a>00772 <span class="comment"> * @return The column&#39;s value corresponding to the give code. </span>
<a name="l00773"></a>00773 <span class="comment"> *</span>
<a name="l00774"></a>00774 <span class="comment"> */</span>
<a name="l00775"></a>00775 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value
<a name="l00776"></a>00776 (
<a name="l00777"></a>00777 column_index INT,
<a name="l00778"></a>00778 code INT,
<a name="l00779"></a>00779 column_type CHAR,
<a name="l00780"></a>00780 meta_tbl_name TEXT
<a name="l00781"></a>00781 )
<a name="l00782"></a>00782 RETURNS TEXT AS $$
<a name="l00783"></a>00783 DECLARE
<a name="l00784"></a>00784 curstmt TEXT;
<a name="l00785"></a>00785 names TEXT[];
<a name="l00786"></a>00786 result TEXT := &#39;&#39;;
<a name="l00787"></a>00787 tmp_txt TEXT := &#39; WHERE column_type = &#39;&#39;c&#39;&#39;&#39;;
<a name="l00788"></a>00788 BEGIN
<a name="l00789"></a>00789 PERFORM MADLIB_SCHEMA.__assert
<a name="l00790"></a>00790 (
<a name="l00791"></a>00791 code IS NOT NULL,
<a name="l00792"></a>00792 &#39;the code of the value should not be null&#39;
<a name="l00793"></a>00793 );
<a name="l00794"></a>00794
<a name="l00795"></a>00795 IF (column_type &lt;&gt; &#39;c&#39;) THEN
<a name="l00796"></a>00796 tmp_txt = MADLIB_SCHEMA.__format
<a name="l00797"></a>00797 (
<a name="l00798"></a>00798 &#39; WHERE <span class="keywordtype">id</span> = % AND column_type = &#39;&#39;%&#39;&#39;&#39;,
<a name="l00799"></a>00799 column_index::TEXT,
<a name="l00800"></a>00800 column_type::TEXT
<a name="l00801"></a>00801 );
<a name="l00802"></a>00802 END IF;
<a name="l00803"></a>00803
<a name="l00804"></a>00804 curstmt = MADLIB_SCHEMA.__format
<a name="l00805"></a>00805 (
<a name="l00806"></a>00806 &#39;SELECT
<a name="l00807"></a>00807 ARRAY[column_name,
<a name="l00808"></a>00808 MADLIB_SCHEMA.__regclass_to_text(table_oid)]
<a name="l00809"></a>00809 FROM %
<a name="l00810"></a>00810 %&#39;,
<a name="l00811"></a>00811 meta_tbl_name,
<a name="l00812"></a>00812 tmp_txt
<a name="l00813"></a>00813 );
<a name="l00814"></a>00814
<a name="l00815"></a>00815 EXECUTE curstmt INTO names;
<a name="l00816"></a>00816
<a name="l00817"></a>00817 PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, &#39;No such column name&#39;);
<a name="l00818"></a>00818 PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, &#39;No such table name&#39;);
<a name="l00819"></a>00819
<a name="l00820"></a>00820 curstmt = MADLIB_SCHEMA.__format
<a name="l00821"></a>00821 (
<a name="l00822"></a>00822 &#39;SELECT MADLIB_SCHEMA.__to_char(fval)
<a name="l00823"></a>00823 FROM %
<a name="l00824"></a>00824 WHERE code = %;&#39;,
<a name="l00825"></a>00825 names[2],
<a name="l00826"></a>00826 code::TEXT
<a name="l00827"></a>00827 );
<a name="l00828"></a>00828
<a name="l00829"></a>00829 EXECUTE curstmt INTO result;
<a name="l00830"></a>00830
<a name="l00831"></a>00831 IF (result IS NULL) THEN
<a name="l00832"></a>00832 result = &#39;NULL&#39;;
<a name="l00833"></a>00833 END IF;
<a name="l00834"></a>00834
<a name="l00835"></a>00835 RETURN result;
<a name="l00836"></a>00836 END
<a name="l00837"></a>00837 $$ LANGUAGE PLPGSQL;
<a name="l00838"></a>00838
<a name="l00839"></a>00839
<a name="l00840"></a>00840 <span class="comment">/*</span>
<a name="l00841"></a>00841 <span class="comment"> * @brief Get the feature value by the specified feature ID and code.</span>
<a name="l00842"></a>00842 <span class="comment"> *</span>
<a name="l00843"></a>00843 <span class="comment"> * @param feature_index The ID of the feature.</span>
<a name="l00844"></a>00844 <span class="comment"> * @param code The code of the feature value.</span>
<a name="l00845"></a>00845 <span class="comment"> * @param meta_tbl_name The full name of the metatable. </span>
<a name="l00846"></a>00846 <span class="comment"> *</span>
<a name="l00847"></a>00847 <span class="comment"> * @return The value of specified code of the feature </span>
<a name="l00848"></a>00848 <span class="comment"> * whose id specified in feature_index.</span>
<a name="l00849"></a>00849 <span class="comment"> *</span>
<a name="l00850"></a>00850 <span class="comment"> */</span>
<a name="l00851"></a>00851 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value
<a name="l00852"></a>00852 (
<a name="l00853"></a>00853 feature_index INT,
<a name="l00854"></a>00854 code INT,
<a name="l00855"></a>00855 meta_tbl_name TEXT
<a name="l00856"></a>00856 )
<a name="l00857"></a>00857 RETURNS TEXT AS $$
<a name="l00858"></a>00858 DECLARE
<a name="l00859"></a>00859 result TEXT := &#39;&#39;;
<a name="l00860"></a>00860 BEGIN
<a name="l00861"></a>00861 result = MADLIB_SCHEMA.__get_column_value
<a name="l00862"></a>00862 (
<a name="l00863"></a>00863 feature_index,
<a name="l00864"></a>00864 code,
<a name="l00865"></a>00865 &#39;f&#39;,
<a name="l00866"></a>00866 meta_tbl_name
<a name="l00867"></a>00867 );
<a name="l00868"></a>00868
<a name="l00869"></a>00869 RETURN result;
<a name="l00870"></a>00870 END
<a name="l00871"></a>00871 $$ LANGUAGE PLPGSQL;
<a name="l00872"></a>00872
<a name="l00873"></a>00873
<a name="l00874"></a>00874 <span class="comment">/*</span>
<a name="l00875"></a>00875 <span class="comment"> * @brief Get the ID column name.</span>
<a name="l00876"></a>00876 <span class="comment"> *</span>
<a name="l00877"></a>00877 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00878"></a>00878 <span class="comment"> *</span>
<a name="l00879"></a>00879 <span class="comment"> * @return The ID column name.</span>
<a name="l00880"></a>00880 <span class="comment"> *</span>
<a name="l00881"></a>00881 <span class="comment"> */</span>
<a name="l00882"></a>00882 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name
<a name="l00883"></a>00883 (
<a name="l00884"></a>00884 meta_tbl_name TEXT
<a name="l00885"></a>00885 )
<a name="l00886"></a>00886 RETURNS TEXT AS $$
<a name="l00887"></a>00887 DECLARE
<a name="l00888"></a>00888 curstmt TEXT;
<a name="l00889"></a>00889 result TEXT := &#39;&#39;;
<a name="l00890"></a>00890 BEGIN
<a name="l00891"></a>00891 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00892"></a>00892 (
<a name="l00893"></a>00893 meta_tbl_name,
<a name="l00894"></a>00894 &#39;t&#39;
<a name="l00895"></a>00895 );
<a name="l00896"></a>00896
<a name="l00897"></a>00897 curstmt = MADLIB_SCHEMA.__format
<a name="l00898"></a>00898 (
<a name="l00899"></a>00899 &#39;SELECT column_name
<a name="l00900"></a>00900 FROM %
<a name="l00901"></a>00901 WHERE column_type = &#39;&#39;i&#39;&#39;
<a name="l00902"></a>00902 LIMIT 1&#39;,
<a name="l00903"></a>00903 meta_tbl_name
<a name="l00904"></a>00904 );
<a name="l00905"></a>00905
<a name="l00906"></a>00906 EXECUTE curstmt INTO result;
<a name="l00907"></a>00907
<a name="l00908"></a>00908 RETURN result;
<a name="l00909"></a>00909 END
<a name="l00910"></a>00910 $$ LANGUAGE PLPGSQL;
<a name="l00911"></a>00911
<a name="l00912"></a>00912
<a name="l00913"></a>00913 <span class="comment">/*</span>
<a name="l00914"></a>00914 <span class="comment"> * @brief Get the class column name.</span>
<a name="l00915"></a>00915 <span class="comment"> *</span>
<a name="l00916"></a>00916 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00917"></a>00917 <span class="comment"> *</span>
<a name="l00918"></a>00918 <span class="comment"> * @return The class column name. </span>
<a name="l00919"></a>00919 <span class="comment"> *</span>
<a name="l00920"></a>00920 <span class="comment"> */</span>
<a name="l00921"></a>00921 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name
<a name="l00922"></a>00922 (
<a name="l00923"></a>00923 meta_tbl_name TEXT
<a name="l00924"></a>00924 )
<a name="l00925"></a>00925 RETURNS TEXT AS $$
<a name="l00926"></a>00926 DECLARE
<a name="l00927"></a>00927 curstmt TEXT;
<a name="l00928"></a>00928 result TEXT := &#39;&#39;;
<a name="l00929"></a>00929 BEGIN
<a name="l00930"></a>00930 PERFORM MADLIB_SCHEMA.__assert_table
<a name="l00931"></a>00931 (
<a name="l00932"></a>00932 meta_tbl_name,
<a name="l00933"></a>00933 &#39;t&#39;
<a name="l00934"></a>00934 );
<a name="l00935"></a>00935
<a name="l00936"></a>00936 curstmt = MADLIB_SCHEMA.__format
<a name="l00937"></a>00937 (
<a name="l00938"></a>00938 &#39;SELECT column_name
<a name="l00939"></a>00939 FROM %
<a name="l00940"></a>00940 WHERE column_type = &#39;&#39;c&#39;&#39; LIMIT 1&#39;,
<a name="l00941"></a>00941 meta_tbl_name
<a name="l00942"></a>00942 );
<a name="l00943"></a>00943
<a name="l00944"></a>00944 EXECUTE curstmt INTO result;
<a name="l00945"></a>00945
<a name="l00946"></a>00946 RETURN result;
<a name="l00947"></a>00947 END
<a name="l00948"></a>00948 $$ LANGUAGE PLPGSQL;
<a name="l00949"></a>00949
<a name="l00950"></a>00950
<a name="l00951"></a>00951 <span class="comment">/*</span>
<a name="l00952"></a>00952 <span class="comment"> * @brief Get the class value by the specified code.</span>
<a name="l00953"></a>00953 <span class="comment"> *</span>
<a name="l00954"></a>00954 <span class="comment"> * @param code The code of the class value.</span>
<a name="l00955"></a>00955 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l00956"></a>00956 <span class="comment"> *</span>
<a name="l00957"></a>00957 <span class="comment"> * @return The class value corresponding to the code. </span>
<a name="l00958"></a>00958 <span class="comment"> *</span>
<a name="l00959"></a>00959 <span class="comment"> */</span>
<a name="l00960"></a>00960 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value
<a name="l00961"></a>00961 (
<a name="l00962"></a>00962 code INT,
<a name="l00963"></a>00963 meta_tbl_name TEXT
<a name="l00964"></a>00964 )
<a name="l00965"></a>00965 RETURNS TEXT AS $$
<a name="l00966"></a>00966 DECLARE
<a name="l00967"></a>00967 result TEXT := &#39;&#39;;
<a name="l00968"></a>00968 BEGIN
<a name="l00969"></a>00969 result = MADLIB_SCHEMA.__get_column_value(0, code, &#39;c&#39;, meta_tbl_name);
<a name="l00970"></a>00970
<a name="l00971"></a>00971 RETURN result;
<a name="l00972"></a>00972 END
<a name="l00973"></a>00973 $$ LANGUAGE PLPGSQL;
<a name="l00974"></a>00974
<a name="l00975"></a>00975
<a name="l00976"></a>00976 <span class="comment">/*</span>
<a name="l00977"></a>00977 <span class="comment"> * @brief breakup each record from the training table.</span>
<a name="l00978"></a>00978 <span class="comment"> * For example, we have the training table t(id, f1, f2, f3, class), </span>
<a name="l00979"></a>00979 <span class="comment"> * then the breakup table is bt(id, fid, fval, is_cont, class). </span>
<a name="l00980"></a>00980 <span class="comment"> * The id column of the two tables is the same. Each feature will be</span>
<a name="l00981"></a>00981 <span class="comment"> * encoded to continuous numeric number. Assume that t has values</span>
<a name="l00982"></a>00982 <span class="comment"> * (1, &#39;a&#39;, 1, 10, &#39;+&#39;) </span>
<a name="l00983"></a>00983 <span class="comment"> * (2, &#39;b&#39;, 2, 8, &#39;-&#39;)</span>
<a name="l00984"></a>00984 <span class="comment"> * (3, &#39;d&#39;, null, 2, &#39;+&#39;) </span>
<a name="l00985"></a>00985 <span class="comment"> * and all of them are discrete features, then the values of bt are </span>
<a name="l00986"></a>00986 <span class="comment"> * (1, 1, &#39;a&#39;, &#39;f&#39;, &#39;+&#39;) </span>
<a name="l00987"></a>00987 <span class="comment"> * (2, 1, &#39;b&#39;, &#39;f&#39;, &#39;-&#39;) </span>
<a name="l00988"></a>00988 <span class="comment"> * (3, 1, &#39;d&#39;, &#39;f&#39;, &#39;+&#39;)</span>
<a name="l00989"></a>00989 <span class="comment"> * (1, 2, 1, &#39;f&#39;, &#39;+&#39;) </span>
<a name="l00990"></a>00990 <span class="comment"> * (2, 2, 2, &#39;f&#39;, &#39;-&#39;) </span>
<a name="l00991"></a>00991 <span class="comment"> * (3, 2, null, &#39;f&#39;, &#39;+&#39;)</span>
<a name="l00992"></a>00992 <span class="comment"> * (1, 3, 10, &#39;f&#39;, &#39;+&#39;) </span>
<a name="l00993"></a>00993 <span class="comment"> * (2, 3, 8, &#39;f&#39;, &#39;-&#39;) </span>
<a name="l00994"></a>00994 <span class="comment"> * (3, 3, 2, &#39;f&#39;, &#39;+&#39;)</span>
<a name="l00995"></a>00995 <span class="comment"> *</span>
<a name="l00996"></a>00996 <span class="comment"> * @param input_tbl_name The full name of the input training table.</span>
<a name="l00997"></a>00997 <span class="comment"> * @param breakup_tbl_name The name of the breakup table.</span>
<a name="l00998"></a>00998 <span class="comment"> * @param kv_cls_name The name of the key-value table for class column.</span>
<a name="l00999"></a>00999 <span class="comment"> * @param id_col_name The name of the ID column. </span>
<a name="l01000"></a>01000 <span class="comment"> * @param attr_col_names The array contains all the features&#39; names.</span>
<a name="l01001"></a>01001 <span class="comment"> * @param is_conts The subscript of the array denotes the feature index.</span>
<a name="l01002"></a>01002 <span class="comment"> * Each value of the array denotes the feature is</span>
<a name="l01003"></a>01003 <span class="comment"> * continuous (&#39;t&#39;) or discrete (&#39;f&#39;)</span>
<a name="l01004"></a>01004 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01005"></a>01005 <span class="comment"> *</span>
<a name="l01006"></a>01006 <span class="comment"> * @return The name of the breakup table, which will be used to generate the encoded</span>
<a name="l01007"></a>01007 <span class="comment"> * table.</span>
<a name="l01008"></a>01008 <span class="comment"> *</span>
<a name="l01009"></a>01009 <span class="comment"> */</span>
<a name="l01010"></a>01010 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__breakup_table
<a name="l01011"></a>01011 (
<a name="l01012"></a>01012 input_tbl_name TEXT,
<a name="l01013"></a>01013 breakup_tbl_name TEXT,
<a name="l01014"></a>01014 kv_cls_name TEXT,
<a name="l01015"></a>01015 id_col_name TEXT,
<a name="l01016"></a>01016 cls_col_name TEXT,
<a name="l01017"></a>01017 attr_col_names TEXT[],
<a name="l01018"></a>01018 is_conts BOOL[],
<a name="l01019"></a>01019 h2hmv_routine_id INT,
<a name="l01020"></a>01020 verbosity INT
<a name="l01021"></a>01021 )
<a name="l01022"></a>01022 RETURNS VOID AS $$
<a name="l01023"></a>01023 DECLARE
<a name="l01024"></a>01024 curstmt TEXT;
<a name="l01025"></a>01025 exec_begin TIMESTAMP;
<a name="l01026"></a>01026 where_txt TEXT := &#39;&#39;;
<a name="l01027"></a>01027 fval_txt TEXT := &#39;fval&#39;;
<a name="l01028"></a>01028 BEGIN
<a name="l01029"></a>01029 exec_begin = clock_timestamp();
<a name="l01030"></a>01030
<a name="l01031"></a>01031 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || breakup_tbl_name;
<a name="l01032"></a>01032
<a name="l01033"></a>01033 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01034"></a>01034 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_GE_4_2_1__&lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01035"></a>01035 -- if the DB is GPDB and its <a class="code" href="utilities_8sql__in.html#a9ce7ffa76acc8a0638e9bc48be99f36a" title="Return MADlib build information.">version</a> is greater than or equal
<a name="l01036"></a>01036 -- to 4.2, then we will use RLE compression for the encoded table.
<a name="l01037"></a>01037 curstmt = MADLIB_SCHEMA.__format
<a name="l01038"></a>01038 (
<a name="l01039"></a>01039 &#39;CREATE TEMP TABLE %
<a name="l01040"></a>01040 (
<a name="l01041"></a>01041 <span class="keywordtype">id</span> BIGINT ENCODING (compresstype=RLE_TYPE),
<a name="l01042"></a>01042 fid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01043"></a>01043 fval TEXT ENCODING (compresstype=RLE_TYPE),
<a name="l01044"></a>01044 is_cont BOOL ENCODING (compresstype=RLE_TYPE),
<a name="l01045"></a>01045 class INT ENCODING (compresstype=RLE_TYPE)
<a name="l01046"></a>01046 )
<a name="l01047"></a>01047 WITH(appendonly=true, orientation=column)
<a name="l01048"></a>01048 DISTRIBUTED BY(<span class="keywordtype">id</span>)&#39;,
<a name="l01049"></a>01049 ARRAY[
<a name="l01050"></a>01050 breakup_tbl_name
<a name="l01051"></a>01051 ]
<a name="l01052"></a>01052 );
<a name="l01053"></a>01053 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01054"></a>01054 curstmt = MADLIB_SCHEMA.__format
<a name="l01055"></a>01055 (
<a name="l01056"></a>01056 &#39;CREATE TEMP TABLE %
<a name="l01057"></a>01057 (
<a name="l01058"></a>01058 <span class="keywordtype">id</span> BIGINT,
<a name="l01059"></a>01059 fid INT,
<a name="l01060"></a>01060 fval TEXT,
<a name="l01061"></a>01061 is_cont BOOL,
<a name="l01062"></a>01062 class INT
<a name="l01063"></a>01063 )
<a name="l01064"></a>01064 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (<span class="keywordtype">id</span>)&#39;)&#39;,
<a name="l01065"></a>01065 ARRAY[
<a name="l01066"></a>01066 breakup_tbl_name
<a name="l01067"></a>01067 ]
<a name="l01068"></a>01068 );
<a name="l01069"></a>01069 &lt;&lt;&lt;)
<a name="l01070"></a>01070 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01071"></a>01071
<a name="l01072"></a>01072 EXECUTE curstmt;
<a name="l01073"></a>01073
<a name="l01074"></a>01074 -- the supported missing value representation (&#39; &#39;, &#39;?&#39; and NULL) will
<a name="l01075"></a>01075 -- be replace with NULL for easy processing later.
<a name="l01076"></a>01076 -- the function __to_char is needed because on some databases an explicit
<a name="l01077"></a>01077 -- cast to text is unavailable.
<a name="l01078"></a>01078 IF (h2hmv_routine_id = 1) THEN
<a name="l01079"></a>01079 where_txt = &#39; WHERE NULLIF(NULLIF(btrim(fval, &#39;&#39; &#39;&#39;), &#39;&#39;?&#39;&#39;), &#39;&#39;&#39;&#39;) IS NOT NULL&#39;;
<a name="l01080"></a>01080 ELSE
<a name="l01081"></a>01081 fval_txt = &#39; CASE WHEN NULLIF(NULLIF(btrim(fval, &#39;&#39; &#39;&#39;), &#39;&#39;?&#39;&#39;), &#39;&#39;&#39;&#39;) IS NULL THEN
<a name="l01082"></a>01082 NULL
<a name="l01083"></a>01083 ELSE
<a name="l01084"></a>01084 fval
<a name="l01085"></a>01085 END &#39;;
<a name="l01086"></a>01086 END IF;
<a name="l01087"></a>01087
<a name="l01088"></a>01088 IF (cls_col_name IS NULL) THEN
<a name="l01089"></a>01089 -- if the kv_cls_name is null, then the class column will be null
<a name="l01090"></a>01090 curstmt = MADLIB_SCHEMA.__format
<a name="l01091"></a>01091 (
<a name="l01092"></a>01092 &#39;INSERT INTO %(<span class="keywordtype">id</span>, fid, fval, is_cont, class)
<a name="l01093"></a>01093 SELECT <span class="keywordtype">id</span>, fid, % as fval, is_cont, class
<a name="l01094"></a>01094 FROM
<a name="l01095"></a>01095 (
<a name="l01096"></a>01096 SELECT %, generate_series(1, %) as fid,
<a name="l01097"></a>01097 unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
<a name="l01098"></a>01098 unnest(array[&#39;&#39;%&#39;&#39;::BOOL]::BOOL[]) as is_cont, NULL as class
<a name="l01099"></a>01099 FROM
<a name="l01100"></a>01100 % t1
<a name="l01101"></a>01101 ) t
<a name="l01102"></a>01102 %&#39;,
<a name="l01103"></a>01103 ARRAY[
<a name="l01104"></a>01104 breakup_tbl_name,
<a name="l01105"></a>01105 fval_txt,
<a name="l01106"></a>01106 id_col_name,
<a name="l01107"></a>01107 array_upper(is_conts, 1)::TEXT,
<a name="l01108"></a>01108 array_to_string(attr_col_names, &#39;), MADLIB_SCHEMA.__to_char(&#39;),
<a name="l01109"></a>01109 array_to_string(is_conts, &#39;&#39;&#39;,&#39;&#39;&#39;),
<a name="l01110"></a>01110 input_tbl_name,
<a name="l01111"></a>01111 where_txt
<a name="l01112"></a>01112 ]
<a name="l01113"></a>01113 );
<a name="l01114"></a>01114
<a name="l01115"></a>01115 ELSE
<a name="l01116"></a>01116 -- for scoring, as the class column may have some values which are not
<a name="l01117"></a>01117 -- appear in the training table, we need use left join here to ensure all
<a name="l01118"></a>01118 -- the rows of the input table were breakup. Here, we simple encode those
<a name="l01119"></a>01119 -- values to 0. Therefore, during scoring, the samples with 0 (encoded
<a name="l01120"></a>01120 -- value) as class label will be recognized as mis-classified.
<a name="l01121"></a>01121 curstmt = MADLIB_SCHEMA.__format
<a name="l01122"></a>01122 (
<a name="l01123"></a>01123 &#39;INSERT INTO %(<span class="keywordtype">id</span>, fid, fval, is_cont, class)
<a name="l01124"></a>01124 SELECT <span class="keywordtype">id</span>, fid, % as fval, is_cont, class
<a name="l01125"></a>01125 FROM
<a name="l01126"></a>01126 (
<a name="l01127"></a>01127 SELECT %, generate_series(1, %) as fid,
<a name="l01128"></a>01128 unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
<a name="l01129"></a>01129 unnest(array[&#39;&#39;%&#39;&#39;::BOOL]::BOOL[]) as is_cont,
<a name="l01130"></a>01130 coalesce(code, 0::INT) as class
<a name="l01131"></a>01131 FROM
<a name="l01132"></a>01132 % t1 LEFT JOIN % t2
<a name="l01133"></a>01133 ON MADLIB_SCHEMA.__to_char(t1.%) = t2.fval
<a name="l01134"></a>01134 ) t
<a name="l01135"></a>01135 %&#39;,
<a name="l01136"></a>01136 ARRAY[
<a name="l01137"></a>01137 breakup_tbl_name,
<a name="l01138"></a>01138 fval_txt,
<a name="l01139"></a>01139 id_col_name,
<a name="l01140"></a>01140 array_upper(is_conts, 1)::TEXT,
<a name="l01141"></a>01141 array_to_string(attr_col_names, &#39;), MADLIB_SCHEMA.__to_char(&#39;),
<a name="l01142"></a>01142 array_to_string(is_conts, &#39;&#39;&#39;,&#39;&#39;&#39;),
<a name="l01143"></a>01143 input_tbl_name,
<a name="l01144"></a>01144 kv_cls_name,
<a name="l01145"></a>01145 cls_col_name,
<a name="l01146"></a>01146 where_txt
<a name="l01147"></a>01147 ]
<a name="l01148"></a>01148 );
<a name="l01149"></a>01149 END IF;
<a name="l01150"></a>01150
<a name="l01151"></a>01151 EXECUTE curstmt;
<a name="l01152"></a>01152
<a name="l01153"></a>01153 IF (verbosity &gt; 0) THEN
<a name="l01154"></a>01154 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01155"></a>01155 RAISE INFO &#39;time of breaking up the training table:%&#39;,
<a name="l01156"></a>01156 clock_timestamp() - exec_begin;
<a name="l01157"></a>01157 END IF;
<a name="l01158"></a>01158 END
<a name="l01159"></a>01159 $$ LANGUAGE PLPGSQL;
<a name="l01160"></a>01160
<a name="l01161"></a>01161
<a name="l01162"></a>01162 <span class="comment">/*</span>
<a name="l01163"></a>01163 <span class="comment"> * @brief Generate the vertical encoded table from the breakup table.</span>
<a name="l01164"></a>01164 <span class="comment"> *</span>
<a name="l01165"></a>01165 <span class="comment"> * @param breakup_tbl_name The full name of the breakup table.</span>
<a name="l01166"></a>01166 <span class="comment"> * @param enc_tbl_name The name of the encoded table. its schema is:</span>
<a name="l01167"></a>01167 <span class="comment"> * id BIGINT, </span>
<a name="l01168"></a>01168 <span class="comment"> * fid INT, </span>
<a name="l01169"></a>01169 <span class="comment"> * fval FLOAT8, </span>
<a name="l01170"></a>01170 <span class="comment"> * is_cont BOOL, </span>
<a name="l01171"></a>01171 <span class="comment"> * class INT</span>
<a name="l01172"></a>01172 <span class="comment"> * @param kv_attr_name The name of the key-value table contains the encoded</span>
<a name="l01173"></a>01173 <span class="comment"> * result for all the features. For continuous feature,</span>
<a name="l01174"></a>01174 <span class="comment"> * it kept the average value of it if in &#39;explicit&#39; mode;</span>
<a name="l01175"></a>01175 <span class="comment"> * nothing will kept if in &#39;ignore&#39; mode.</span>
<a name="l01176"></a>01176 <span class="comment"> * @param is_tbl_tmp If ture we will create the encoded table as a temp one.</span>
<a name="l01177"></a>01177 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01178"></a>01178 <span class="comment"> *</span>
<a name="l01179"></a>01179 <span class="comment"> */</span>
<a name="l01180"></a>01180 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_vertical_encoded_table
<a name="l01181"></a>01181 (
<a name="l01182"></a>01182 breakup_tbl_name TEXT,
<a name="l01183"></a>01183 enc_tbl_name TEXT,
<a name="l01184"></a>01184 kv_attr_name TEXT,
<a name="l01185"></a>01185 is_tbl_tmp BOOL,
<a name="l01186"></a>01186 verbosity INT
<a name="l01187"></a>01187 )
<a name="l01188"></a>01188 RETURNS VOID AS $$
<a name="l01189"></a>01189 DECLARE
<a name="l01190"></a>01190 curstmt TEXT;
<a name="l01191"></a>01191 exec_begin TIMESTAMP;
<a name="l01192"></a>01192 tmp_txt TEXT = &#39;&#39;;
<a name="l01193"></a>01193 BEGIN
<a name="l01194"></a>01194 IF (is_tbl_tmp) THEN
<a name="l01195"></a>01195 tmp_txt = &#39; TEMP &#39;;
<a name="l01196"></a>01196 END IF;
<a name="l01197"></a>01197
<a name="l01198"></a>01198 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || enc_tbl_name;
<a name="l01199"></a>01199
<a name="l01200"></a>01200 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01201"></a>01201 m4_ifdef(&gt;&gt;&gt;__GREENPLUM_GE_4_2_1__&lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01202"></a>01202 curstmt = MADLIB_SCHEMA.__format
<a name="l01203"></a>01203 (
<a name="l01204"></a>01204 &#39;CREATE % TABLE %
<a name="l01205"></a>01205 (
<a name="l01206"></a>01206 <span class="keywordtype">id</span> BIGINT ENCODING (compresstype=RLE_TYPE),
<a name="l01207"></a>01207 fid INT ENCODING (compresstype=RLE_TYPE),
<a name="l01208"></a>01208 fval FLOAT8 ENCODING (compresstype=RLE_TYPE),
<a name="l01209"></a>01209 is_cont BOOL ENCODING (compresstype=RLE_TYPE),
<a name="l01210"></a>01210 class INT ENCODING (compresstype=RLE_TYPE)
<a name="l01211"></a>01211 )
<a name="l01212"></a>01212 WITH(appendonly=true, orientation=column)
<a name="l01213"></a>01213 DISTRIBUTED BY(<span class="keywordtype">id</span>)&#39;,
<a name="l01214"></a>01214 ARRAY[
<a name="l01215"></a>01215 tmp_txt,
<a name="l01216"></a>01216 enc_tbl_name
<a name="l01217"></a>01217 ]
<a name="l01218"></a>01218 );
<a name="l01219"></a>01219 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01220"></a>01220 curstmt = MADLIB_SCHEMA.__format
<a name="l01221"></a>01221 (
<a name="l01222"></a>01222 &#39;CREATE % TABLE %
<a name="l01223"></a>01223 (
<a name="l01224"></a>01224 <span class="keywordtype">id</span> BIGINT,
<a name="l01225"></a>01225 fid INT,
<a name="l01226"></a>01226 fval FLOAT8,
<a name="l01227"></a>01227 is_cont BOOL,
<a name="l01228"></a>01228 class INT
<a name="l01229"></a>01229 )
<a name="l01230"></a>01230 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (<span class="keywordtype">id</span>)&#39;)&#39;,
<a name="l01231"></a>01231 ARRAY[
<a name="l01232"></a>01232 tmp_txt,
<a name="l01233"></a>01233 enc_tbl_name
<a name="l01234"></a>01234 ]
<a name="l01235"></a>01235 );
<a name="l01236"></a>01236 &lt;&lt;&lt;)
<a name="l01237"></a>01237 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01238"></a>01238
<a name="l01239"></a>01239 IF (verbosity &gt; 0) THEN
<a name="l01240"></a>01240 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01241"></a>01241 END IF;
<a name="l01242"></a>01242 EXECUTE curstmt;
<a name="l01243"></a>01243
<a name="l01244"></a>01244 -- Generating the encoded table through join the breakup table with
<a name="l01245"></a>01245 -- the KV table for all the features
<a name="l01246"></a>01246 curstmt = MADLIB_SCHEMA.__format
<a name="l01247"></a>01247 (
<a name="l01248"></a>01248 &#39;INSERT INTO %(<span class="keywordtype">id</span>, fid, fval, is_cont, class)
<a name="l01249"></a>01249 SELECT p.<span class="keywordtype">id</span> AS <span class="keywordtype">id</span>, p.fid AS fid,
<a name="l01250"></a>01250 CASE WHEN (p.is_cont AND p.fval IS NOT NULL) THEN
<a name="l01251"></a>01251 p.fval::FLOAT8
<a name="l01252"></a>01252 ELSE
<a name="l01253"></a>01253 m.code::FLOAT8
<a name="l01254"></a>01254 END AS fval,
<a name="l01255"></a>01255 p.is_cont AS is_cont,
<a name="l01256"></a>01256 p.class::INT AS class
<a name="l01257"></a>01257 FROM
<a name="l01258"></a>01258 % p LEFT JOIN % m
<a name="l01259"></a>01259 ON
<a name="l01260"></a>01260 m.fid = p.fid AND
<a name="l01261"></a>01261 (coalesce(m.fval, &#39;&#39;&#39;&#39;) = (coalesce(p.fval, &#39;&#39;&#39;&#39;)))&#39;,
<a name="l01262"></a>01262 ARRAY[
<a name="l01263"></a>01263 enc_tbl_name,
<a name="l01264"></a>01264 breakup_tbl_name,
<a name="l01265"></a>01265 kv_attr_name
<a name="l01266"></a>01266 ]
<a name="l01267"></a>01267 );
<a name="l01268"></a>01268
<a name="l01269"></a>01269 IF (verbosity &gt; 0) THEN
<a name="l01270"></a>01270 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01271"></a>01271 END IF;
<a name="l01272"></a>01272 EXECUTE curstmt;
<a name="l01273"></a>01273 END
<a name="l01274"></a>01274 $$ LANGUAGE PLPGSQL;
<a name="l01275"></a>01275
<a name="l01276"></a>01276
<a name="l01277"></a>01277 <span class="comment">/*</span>
<a name="l01278"></a>01278 <span class="comment"> * @brief Generate the horizontal table from a given vertical table.</span>
<a name="l01279"></a>01279 <span class="comment"> *</span>
<a name="l01280"></a>01280 <span class="comment"> * @param hor_tbl_name The full name of the horizontal table.</span>
<a name="l01281"></a>01281 <span class="comment"> * @param ver_tbl_name The full name of the vertical table.</span>
<a name="l01282"></a>01282 <span class="comment"> * @param meta_tbl_name The full name of the meta data table.</span>
<a name="l01283"></a>01283 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01284"></a>01284 <span class="comment"> *</span>
<a name="l01285"></a>01285 <span class="comment"> */</span>
<a name="l01286"></a>01286 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_horizontal_encoded_table
<a name="l01287"></a>01287 (
<a name="l01288"></a>01288 hor_tbl_name TEXT,
<a name="l01289"></a>01289 ver_tbl_name TEXT,
<a name="l01290"></a>01290 attr_count INT,
<a name="l01291"></a>01291 verbosity INT
<a name="l01292"></a>01292 )
<a name="l01293"></a>01293 RETURNS VOID AS $$
<a name="l01294"></a>01294 DECLARE
<a name="l01295"></a>01295 curstmt TEXT;
<a name="l01296"></a>01296 exec_begin TIMESTAMP;
<a name="l01297"></a>01297 BEGIN
<a name="l01298"></a>01298 exec_begin = clock_timestamp();
<a name="l01299"></a>01299
<a name="l01300"></a>01300 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || hor_tbl_name;
<a name="l01301"></a>01301 curstmt = MADLIB_SCHEMA.__format
<a name="l01302"></a>01302 (
<a name="l01303"></a>01303 &#39;CREATE TEMP TABLE %(<span class="keywordtype">id</span>, fvals, class) AS
<a name="l01304"></a>01304 SELECT
<a name="l01305"></a>01305 <span class="keywordtype">id</span>,
<a name="l01306"></a>01306 MADLIB_SCHEMA.__array_indexed_agg(fval, %, fid) as fvals,
<a name="l01307"></a>01307 min(class)::INT as class
<a name="l01308"></a>01308 FROM %
<a name="l01309"></a>01309 GROUP BY <span class="keywordtype">id</span>
<a name="l01310"></a>01310 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (<span class="keywordtype">id</span>)&#39;)&#39;,
<a name="l01311"></a>01311 ARRAY[
<a name="l01312"></a>01312 hor_tbl_name,
<a name="l01313"></a>01313 attr_count::TEXT,
<a name="l01314"></a>01314 ver_tbl_name
<a name="l01315"></a>01315 ]
<a name="l01316"></a>01316 );
<a name="l01317"></a>01317 EXECUTE curstmt;
<a name="l01318"></a>01318
<a name="l01319"></a>01319 IF (verbosity &gt; 0) THEN
<a name="l01320"></a>01320 RAISE INFO &#39;time of generating horizontal table from vertical table:%&#39;,
<a name="l01321"></a>01321 clock_timestamp() - exec_begin;
<a name="l01322"></a>01322 END IF;
<a name="l01323"></a>01323 END
<a name="l01324"></a>01324 $$ LANGUAGE PLPGSQL;
<a name="l01325"></a>01325
<a name="l01326"></a>01326
<a name="l01327"></a>01327 <span class="comment">/*</span>
<a name="l01328"></a>01328 <span class="comment"> * @brief Encode the continuous and discrete features and the class column.</span>
<a name="l01329"></a>01329 <span class="comment"> * In &#39;ignore&#39; mode, for each discrete feature/class, we will use </span>
<a name="l01330"></a>01330 <span class="comment"> * continuous integer to encode each distinct value (null value </span>
<a name="l01331"></a>01331 <span class="comment"> * will be excluded). Continuous feature will not be processed. </span>
<a name="l01332"></a>01332 <span class="comment"> * In &#39;explicit&#39; mode, null value will be included for discrete </span>
<a name="l01333"></a>01333 <span class="comment"> * feature. For continuous feature, null value will be replaced by</span>
<a name="l01334"></a>01334 <span class="comment"> * the average value of this feature.</span>
<a name="l01335"></a>01335 <span class="comment"> * </span>
<a name="l01336"></a>01336 <span class="comment"> * @param kv_attr_name The name of the key-value table contains the encoded</span>
<a name="l01337"></a>01337 <span class="comment"> * result for all the features. For continuous feature,</span>
<a name="l01338"></a>01338 <span class="comment"> * it kept the average value of it if in &#39;explicit&#39; mode;</span>
<a name="l01339"></a>01339 <span class="comment"> * nothing will kept if in &#39;ignore&#39; mode.</span>
<a name="l01340"></a>01340 <span class="comment"> * @param breakup_tbl_name The name of the breakup table from raw training table.</span>
<a name="l01341"></a>01341 <span class="comment"> * @param h2hmv_routine_id The ID of the routine which specifies </span>
<a name="l01342"></a>01342 <span class="comment"> * How to handle missing value(h2hmv).</span>
<a name="l01343"></a>01343 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode.</span>
<a name="l01344"></a>01344 <span class="comment"> *</span>
<a name="l01345"></a>01345 <span class="comment"> *</span>
<a name="l01346"></a>01346 <span class="comment"> */</span>
<a name="l01347"></a>01347 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_columns
<a name="l01348"></a>01348 (
<a name="l01349"></a>01349 kv_attr_name TEXT,
<a name="l01350"></a>01350 breakup_tbl_name TEXT,
<a name="l01351"></a>01351 h2hmv_routine_id INT,
<a name="l01352"></a>01352 verbosity INT
<a name="l01353"></a>01353 )
<a name="l01354"></a>01354 RETURNS VOID AS $$
<a name="l01355"></a>01355 DECLARE
<a name="l01356"></a>01356 curstmt TEXT;
<a name="l01357"></a>01357 tmp_txt TEXT = &#39;&#39;;
<a name="l01358"></a>01358 BEGIN
<a name="l01359"></a>01359
<a name="l01360"></a>01360 -- This table will be used to generate the KV table
<a name="l01361"></a>01361 -- for the discrete features and retrieve the number
<a name="l01362"></a>01362 -- of distinct values for a feature outside of this
<a name="l01363"></a>01363 -- function. Therefore, don&#39;t drop this table in this
<a name="l01364"></a>01364 -- function.
<a name="l01365"></a>01365 EXECUTE &#39;DROP TABLE IF EXISTS tmp_dist_table&#39;;
<a name="l01366"></a>01366 curstmt = MADLIB_SCHEMA.__format
<a name="l01367"></a>01367 (
<a name="l01368"></a>01368 &#39;CREATE TEMP TABLE tmp_dist_table AS
<a name="l01369"></a>01369 SELECT fid, fval, is_cont
<a name="l01370"></a>01370 FROM %
<a name="l01371"></a>01371 GROUP BY fid, fval, is_cont&#39;,
<a name="l01372"></a>01372 ARRAY[
<a name="l01373"></a>01373 breakup_tbl_name
<a name="l01374"></a>01374 ]
<a name="l01375"></a>01375 );
<a name="l01376"></a>01376 IF (verbosity &gt; 0) THEN
<a name="l01377"></a>01377 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01378"></a>01378 END IF;
<a name="l01379"></a>01379
<a name="l01380"></a>01380 EXECUTE curstmt;
<a name="l01381"></a>01381
<a name="l01382"></a>01382 -- create the KV table for all the features and
<a name="l01383"></a>01383 -- populate the keys of the discrete features
<a name="l01384"></a>01384 -- to the table.
<a name="l01385"></a>01385 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || kv_attr_name;
<a name="l01386"></a>01386 curstmt = MADLIB_SCHEMA.__format
<a name="l01387"></a>01387 (
<a name="l01388"></a>01388 &#39;CREATE TABLE %(fid, fval, code) AS
<a name="l01389"></a>01389 SELECT
<a name="l01390"></a>01390 fid,
<a name="l01391"></a>01391 fval,
<a name="l01392"></a>01392 (rank() OVER (PARTITION BY fid ORDER BY fval))::FLOAT8 AS code
<a name="l01393"></a>01393 FROM tmp_dist_table
<a name="l01394"></a>01394 WHERE (NOT is_cont)
<a name="l01395"></a>01395 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (fid, fval)&#39;)&#39;,
<a name="l01396"></a>01396 ARRAY[
<a name="l01397"></a>01397 kv_attr_name
<a name="l01398"></a>01398 ]
<a name="l01399"></a>01399 );
<a name="l01400"></a>01400 IF (verbosity &gt; 0) THEN
<a name="l01401"></a>01401 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01402"></a>01402 END IF;
<a name="l01403"></a>01403 EXECUTE curstmt;
<a name="l01404"></a>01404
<a name="l01405"></a>01405 -- In &quot;explicit&quot; mode, we need to replace the missing
<a name="l01406"></a>01406 -- value with the average value. Therefore, we keep
<a name="l01407"></a>01407 -- those values to the KV table.
<a name="l01408"></a>01408 IF (h2hmv_routine_id = 2) THEN
<a name="l01409"></a>01409 curstmt = MADLIB_SCHEMA.__format
<a name="l01410"></a>01410 (
<a name="l01411"></a>01411 &#39;INSERT INTO %(fid, fval, code)
<a name="l01412"></a>01412 SELECT
<a name="l01413"></a>01413 fid,
<a name="l01414"></a>01414 null,
<a name="l01415"></a>01415 coalesce(<a class="code" href="linalg_8sql__in.html#a1aa37f73fb1cd8d7d106aa518dd8c0b4" title="Compute the average of vectors.">avg</a>(fval::FLOAT8), 0.0)
<a name="l01416"></a>01416 FROM
<a name="l01417"></a>01417 % s
<a name="l01418"></a>01418 WHERE is_cont
<a name="l01419"></a>01419 GROUP BY fid&#39;,
<a name="l01420"></a>01420 ARRAY[
<a name="l01421"></a>01421 kv_attr_name,
<a name="l01422"></a>01422 breakup_tbl_name
<a name="l01423"></a>01423 ]
<a name="l01424"></a>01424 );
<a name="l01425"></a>01425 IF (verbosity &gt; 0) THEN
<a name="l01426"></a>01426 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01427"></a>01427 END IF;
<a name="l01428"></a>01428
<a name="l01429"></a>01429 EXECUTE curstmt;
<a name="l01430"></a>01430 END IF;
<a name="l01431"></a>01431 END
<a name="l01432"></a>01432 $$ LANGUAGE PLPGSQL;
<a name="l01433"></a>01433
<a name="l01434"></a>01434
<a name="l01435"></a>01435 <span class="comment">/*</span>
<a name="l01436"></a>01436 <span class="comment"> * @brief Encode a table for training in C4.5 and RF.</span>
<a name="l01437"></a>01437 <span class="comment"> *</span>
<a name="l01438"></a>01438 <span class="comment"> * @param input_tbl_name The full name of the input table.</span>
<a name="l01439"></a>01439 <span class="comment"> * @param id_col_name The name of id column.</span>
<a name="l01440"></a>01440 <span class="comment"> * @param feature_names An array contains all the feature. If it&#39;s null, </span>
<a name="l01441"></a>01441 <span class="comment"> * we will get all the columns of the input table.</span>
<a name="l01442"></a>01442 <span class="comment"> * @param cls_col_name The name of class column.</span>
<a name="l01443"></a>01443 <span class="comment"> * @param cont_attr_names An array contains all the continuous feature.</span>
<a name="l01444"></a>01444 <span class="comment"> * Null means no continuous feature.</span>
<a name="l01445"></a>01445 <span class="comment"> * @param enc_table_name The full name of the encoded table.</span>
<a name="l01446"></a>01446 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l01447"></a>01447 <span class="comment"> * @param h2hmv_routine_id The ID of the routine which specifies </span>
<a name="l01448"></a>01448 <span class="comment"> * How to handle missing value(h2hmv).</span>
<a name="l01449"></a>01449 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01450"></a>01450 <span class="comment"> *</span>
<a name="l01451"></a>01451 <span class="comment"> */</span>
<a name="l01452"></a>01452 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
<a name="l01453"></a>01453 (
<a name="l01454"></a>01454 input_tbl_name TEXT,
<a name="l01455"></a>01455 id_col_name TEXT,
<a name="l01456"></a>01456 feature_names TEXT[],
<a name="l01457"></a>01457 cls_col_name TEXT,
<a name="l01458"></a>01458 cont_attr_names TEXT[],
<a name="l01459"></a>01459 enc_table_name TEXT,
<a name="l01460"></a>01460 meta_tbl_name TEXT,
<a name="l01461"></a>01461 h2hmv_routine_id INT,
<a name="l01462"></a>01462 verbosity INT
<a name="l01463"></a>01463 )
<a name="l01464"></a>01464 RETURNS VOID AS $$
<a name="l01465"></a>01465 DECLARE
<a name="l01466"></a>01466 curstmt TEXT := &#39;&#39;;
<a name="l01467"></a>01467 attr_col_names TEXT[];
<a name="l01468"></a>01468 lit_attr_col_names TEXT[];
<a name="l01469"></a>01469 kv_attr_name TEXT := enc_table_name || &#39;_col&#39;;
<a name="l01470"></a>01470 kv_cls_name TEXT := enc_table_name || &#39;_class&#39;;
<a name="l01471"></a>01471 is_conts BOOL[];
<a name="l01472"></a>01472 breakup_tbl_name TEXT := &#39;tmp_breakup_table&#39;;
<a name="l01473"></a>01473 exec_begin TIMESTAMP;
<a name="l01474"></a>01474 ret MADLIB_SCHEMA.__enc_tbl_result;
<a name="l01475"></a>01475 BEGIN
<a name="l01476"></a>01476 exec_begin = clock_timestamp();
<a name="l01477"></a>01477
<a name="l01478"></a>01478 -- validate the training table
<a name="l01479"></a>01479 PERFORM MADLIB_SCHEMA.__validate_input_table
<a name="l01480"></a>01480 (
<a name="l01481"></a>01481 input_tbl_name,
<a name="l01482"></a>01482 feature_names,
<a name="l01483"></a>01483 id_col_name,
<a name="l01484"></a>01484 cls_col_name
<a name="l01485"></a>01485 );
<a name="l01486"></a>01486
<a name="l01487"></a>01487 -- create metatable
<a name="l01488"></a>01488 PERFORM MADLIB_SCHEMA.__create_metatable(meta_tbl_name);
<a name="l01489"></a>01489
<a name="l01490"></a>01490 -- retrieve all the features&#39; names
<a name="l01491"></a>01491 IF (feature_names IS NULL) THEN
<a name="l01492"></a>01492 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01493"></a>01493 m4_ifdef(`__HAS_ORDERED_AGGREGATES__&#39;, &gt;&gt;&gt;
<a name="l01494"></a>01494 curstmt = MADLIB_SCHEMA.__format
<a name="l01495"></a>01495 (
<a name="l01496"></a>01496 &#39;SELECT <a class="code" href="array__ops_8sql__in.html#af9f60293134ba4ce05b1f97f6faba822" title="ARRAY_AGG aggregate for compatibility with GPDB &lt; 4.1 and Postgres &lt; 9.0 This is a slower solution th...">array_agg</a>(quote_ident(attname) ORDER BY attname) as attnames
<a name="l01497"></a>01497 FROM pg_attribute
<a name="l01498"></a>01498 WHERE attrelid = &#39;&#39;%&#39;&#39;::regclass and attnum &gt; 0 AND
<a name="l01499"></a>01499 attname &lt;&gt; &#39;&#39;%&#39;&#39; AND
<a name="l01500"></a>01500 attname &lt;&gt; &#39;&#39;%&#39;&#39; AND
<a name="l01501"></a>01501 NOT attisdropped;&#39;,
<a name="l01502"></a>01502 ARRAY[
<a name="l01503"></a>01503 input_tbl_name,
<a name="l01504"></a>01504 id_col_name,
<a name="l01505"></a>01505 cls_col_name
<a name="l01506"></a>01506 ]
<a name="l01507"></a>01507 );
<a name="l01508"></a>01508
<a name="l01509"></a>01509 EXECUTE curstmt INTO attr_col_names;
<a name="l01510"></a>01510 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01511"></a>01511 curstmt = MADLIB_SCHEMA.__format
<a name="l01512"></a>01512 (
<a name="l01513"></a>01513 &#39;SELECT ARRAY
<a name="l01514"></a>01514 (
<a name="l01515"></a>01515 SELECT quote_ident(attname)
<a name="l01516"></a>01516 FROM pg_attribute
<a name="l01517"></a>01517 WHERE attrelid = &#39;&#39;%&#39;&#39;::regclass and attnum &gt; 0 AND
<a name="l01518"></a>01518 attname &lt;&gt; &#39;&#39;%&#39;&#39; AND
<a name="l01519"></a>01519 attname &lt;&gt; &#39;&#39;%&#39;&#39; AND
<a name="l01520"></a>01520 NOT attisdropped
<a name="l01521"></a>01521 ORDER BY attname
<a name="l01522"></a>01522 LIMIT ALL
<a name="l01523"></a>01523 )&#39;,
<a name="l01524"></a>01524 ARRAY[
<a name="l01525"></a>01525 input_tbl_name,
<a name="l01526"></a>01526 id_col_name,
<a name="l01527"></a>01527 cls_col_name
<a name="l01528"></a>01528 ]
<a name="l01529"></a>01529 );
<a name="l01530"></a>01530 EXECUTE curstmt INTO attr_col_names;
<a name="l01531"></a>01531 &lt;&lt;&lt;)
<a name="l01532"></a>01532 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01533"></a>01533 ELSE
<a name="l01534"></a>01534 attr_col_names = MADLIB_SCHEMA.__array_sort(feature_names);
<a name="l01535"></a>01535 END IF;
<a name="l01536"></a>01536
<a name="l01537"></a>01537 -- an array contains if a feature is continuous or not
<a name="l01538"></a>01538 -- the subscript is corresponding to the feature&#39;s ID
<a name="l01539"></a>01539 is_conts = MADLIB_SCHEMA.__array_elem_in(cont_attr_names, attr_col_names);
<a name="l01540"></a>01540
<a name="l01541"></a>01541 ret.pre_proc_time = clock_timestamp() - exec_begin;
<a name="l01542"></a>01542 exec_begin = clock_timestamp();
<a name="l01543"></a>01543
<a name="l01544"></a>01544 -- create the KV table for the class column.
<a name="l01545"></a>01545 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || kv_cls_name;
<a name="l01546"></a>01546 curstmt = MADLIB_SCHEMA.__format
<a name="l01547"></a>01547 (
<a name="l01548"></a>01548 &#39;CREATE TABLE % AS
<a name="l01549"></a>01549 SELECT 0 as fid,
<a name="l01550"></a>01550 MADLIB_SCHEMA.__to_char(%) AS fval,
<a name="l01551"></a>01551 rank() OVER (ORDER BY %) AS code
<a name="l01552"></a>01552 FROM
<a name="l01553"></a>01553 (
<a name="l01554"></a>01554 SELECT % FROM % GROUP BY %
<a name="l01555"></a>01555 ) t
<a name="l01556"></a>01556 m4_ifdef(`__GREENPLUM__&#39;, `DISTRIBUTED BY (fval)&#39;)&#39;,
<a name="l01557"></a>01557 ARRAY[
<a name="l01558"></a>01558 kv_cls_name,
<a name="l01559"></a>01559 cls_col_name,
<a name="l01560"></a>01560 cls_col_name,
<a name="l01561"></a>01561 cls_col_name,
<a name="l01562"></a>01562 input_tbl_name,
<a name="l01563"></a>01563 cls_col_name
<a name="l01564"></a>01564 ]
<a name="l01565"></a>01565 );
<a name="l01566"></a>01566 IF (verbosity &gt; 0) THEN
<a name="l01567"></a>01567 RAISE INFO &#39;%&#39;, curstmt;
<a name="l01568"></a>01568 END IF;
<a name="l01569"></a>01569 EXECUTE curstmt;
<a name="l01570"></a>01570
<a name="l01571"></a>01571 ret.gen_kv_time = clock_timestamp() - exec_begin;
<a name="l01572"></a>01572 exec_begin = clock_timestamp();
<a name="l01573"></a>01573
<a name="l01574"></a>01574 -- breakup each record of the training table and keep the result
<a name="l01575"></a>01575 -- into a new table.
<a name="l01576"></a>01576 PERFORM MADLIB_SCHEMA.__breakup_table
<a name="l01577"></a>01577 (
<a name="l01578"></a>01578 input_tbl_name,
<a name="l01579"></a>01579 breakup_tbl_name,
<a name="l01580"></a>01580 kv_cls_name,
<a name="l01581"></a>01581 id_col_name,
<a name="l01582"></a>01582 cls_col_name,
<a name="l01583"></a>01583 attr_col_names,
<a name="l01584"></a>01584 is_conts,
<a name="l01585"></a>01585 h2hmv_routine_id,
<a name="l01586"></a>01586 verbosity
<a name="l01587"></a>01587 );
<a name="l01588"></a>01588
<a name="l01589"></a>01589 ret.breakup_tbl_time= clock_timestamp() - exec_begin;
<a name="l01590"></a>01590 exec_begin = clock_timestamp();
<a name="l01591"></a>01591
<a name="l01592"></a>01592 -- generate the KV table for both continuous features
<a name="l01593"></a>01593 -- and discrete features.
<a name="l01594"></a>01594 PERFORM MADLIB_SCHEMA.__encode_columns
<a name="l01595"></a>01595 (
<a name="l01596"></a>01596 kv_attr_name,
<a name="l01597"></a>01597 breakup_tbl_name,
<a name="l01598"></a>01598 h2hmv_routine_id,
<a name="l01599"></a>01599 verbosity
<a name="l01600"></a>01600 );
<a name="l01601"></a>01601
<a name="l01602"></a>01602 ret.gen_kv_time = ret.gen_kv_time + (clock_timestamp() - exec_begin);
<a name="l01603"></a>01603 exec_begin = clock_timestamp();
<a name="l01604"></a>01604
<a name="l01605"></a>01605 -- generate the encoded table using the breakup table
<a name="l01606"></a>01606 -- and KV table for all the features.
<a name="l01607"></a>01607 PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
<a name="l01608"></a>01608 (
<a name="l01609"></a>01609 breakup_tbl_name,
<a name="l01610"></a>01610 enc_table_name,
<a name="l01611"></a>01611 kv_attr_name,
<a name="l01612"></a>01612 &#39;f&#39;::BOOL,
<a name="l01613"></a>01613 verbosity
<a name="l01614"></a>01614 );
<a name="l01615"></a>01615
<a name="l01616"></a>01616 ret.gen_enc_time = clock_timestamp() - exec_begin;
<a name="l01617"></a>01617 exec_begin = clock_timestamp();
<a name="l01618"></a>01618
<a name="l01619"></a>01619 SELECT ARRAY(SELECT quote_literal(unnest(attr_col_names))) INTO lit_attr_col_names;
<a name="l01620"></a>01620
<a name="l01621"></a>01621 -- put the features&#39; meta information to the metatable
<a name="l01622"></a>01622 curstmt = MADLIB_SCHEMA.__format
<a name="l01623"></a>01623 (
<a name="l01624"></a>01624 &#39;INSERT INTO %
<a name="l01625"></a>01625 SELECT fid as <span class="keywordtype">id</span>, (ARRAY[%])[fid] as column_name,
<a name="l01626"></a>01626 &#39;&#39;f&#39;&#39; as column_type,
<a name="l01627"></a>01627 is_cont,
<a name="l01628"></a>01628 &#39;&#39;%&#39;&#39;::regclass::OID,
<a name="l01629"></a>01629 count(fid) as num_dist_value
<a name="l01630"></a>01630 FROM % t
<a name="l01631"></a>01631 GROUP BY fid, is_cont&#39;,
<a name="l01632"></a>01632 ARRAY[
<a name="l01633"></a>01633 meta_tbl_name,
<a name="l01634"></a>01634 array_to_string(lit_attr_col_names, &#39;,&#39;),
<a name="l01635"></a>01635 kv_attr_name,
<a name="l01636"></a>01636 &#39;tmp_dist_table&#39;
<a name="l01637"></a>01637 ]
<a name="l01638"></a>01638 );
<a name="l01639"></a>01639
<a name="l01640"></a>01640 EXECUTE curstmt;
<a name="l01641"></a>01641
<a name="l01642"></a>01642 IF (h2hmv_routine_id = 1) THEN
<a name="l01643"></a>01643 -- retrieve the information of the columns (all the values in those
<a name="l01644"></a>01644 -- columns are missing), and insert them to the meta table.
<a name="l01645"></a>01645 curstmt = MADLIB_SCHEMA.__format
<a name="l01646"></a>01646 (
<a name="l01647"></a>01647 &#39;INSERT INTO %
<a name="l01648"></a>01648 SELECT <span class="keywordtype">id</span>, (ARRAY[%])[<span class="keywordtype">id</span>] as column_name,
<a name="l01649"></a>01649 &#39;&#39;f&#39;&#39; as column_type, &#39;&#39;t&#39;&#39;, NULL, 0
<a name="l01650"></a>01650 FROM (
<a name="l01651"></a>01651 SELECT generate_series(1, %) <span class="keywordtype">id</span>
<a name="l01652"></a>01652 EXCEPT
<a name="l01653"></a>01653 SELECT <span class="keywordtype">id</span> FROM % WHERE column_type = &#39;&#39;f&#39;&#39;
<a name="l01654"></a>01654 ) t&#39;,
<a name="l01655"></a>01655 ARRAY[
<a name="l01656"></a>01656 meta_tbl_name,
<a name="l01657"></a>01657 array_to_string(lit_attr_col_names, &#39;,&#39;),
<a name="l01658"></a>01658 array_upper(attr_col_names, 1)::TEXT,
<a name="l01659"></a>01659 meta_tbl_name
<a name="l01660"></a>01660 ]
<a name="l01661"></a>01661 );
<a name="l01662"></a>01662 EXECUTE curstmt;
<a name="l01663"></a>01663 END IF;
<a name="l01664"></a>01664
<a name="l01665"></a>01665 -- no need this table
<a name="l01666"></a>01666 EXECUTE &#39;DROP TABLE IF EXISTS tmp_dist_table&#39;;
<a name="l01667"></a>01667
<a name="l01668"></a>01668 -- put the class&#39;s meta information to the metatable
<a name="l01669"></a>01669 curstmt = MADLIB_SCHEMA.__format
<a name="l01670"></a>01670 (
<a name="l01671"></a>01671 &#39;INSERT INTO %
<a name="l01672"></a>01672 SELECT 0 as <span class="keywordtype">id</span>,&#39;&#39;%&#39;&#39;,
<a name="l01673"></a>01673 &#39;&#39;c&#39;&#39; as column_type,
<a name="l01674"></a>01674 &#39;&#39;f&#39;&#39;::BOOL,
<a name="l01675"></a>01675 &#39;&#39;%&#39;&#39;::regclass::OID,
<a name="l01676"></a>01676 count(code) as num_dist_value
<a name="l01677"></a>01677 FROM % t
<a name="l01678"></a>01678 GROUP BY fid&#39;,
<a name="l01679"></a>01679 ARRAY[
<a name="l01680"></a>01680 meta_tbl_name,
<a name="l01681"></a>01681 cls_col_name,
<a name="l01682"></a>01682 kv_cls_name,
<a name="l01683"></a>01683 kv_cls_name
<a name="l01684"></a>01684 ]
<a name="l01685"></a>01685 );
<a name="l01686"></a>01686
<a name="l01687"></a>01687 EXECUTE curstmt;
<a name="l01688"></a>01688
<a name="l01689"></a>01689 -- put the <span class="keywordtype">id</span>&#39;s meta information to the metatable
<a name="l01690"></a>01690 PERFORM MADLIB_SCHEMA.__insert_into_metatable
<a name="l01691"></a>01691 (
<a name="l01692"></a>01692 meta_tbl_name,
<a name="l01693"></a>01693 array_upper(attr_col_names, 1) + 1,
<a name="l01694"></a>01694 id_col_name,
<a name="l01695"></a>01695 &#39;i&#39;, &#39;f&#39;, NULL, 0
<a name="l01696"></a>01696 );
<a name="l01697"></a>01697
<a name="l01698"></a>01698 -- analyze the table, so that later the optimizer has the statistics
<a name="l01699"></a>01699 -- information about this table
<a name="l01700"></a>01700 EXECUTE &#39;ANALYZE &#39; || enc_table_name;
<a name="l01701"></a>01701
<a name="l01702"></a>01702 ret.post_proc_time = clock_timestamp() - exec_begin;
<a name="l01703"></a>01703
<a name="l01704"></a>01704 IF (verbosity &gt; 0) THEN
<a name="l01705"></a>01705 RAISE INFO &#39;time of encoding: %&#39;, ret;
<a name="l01706"></a>01706 END IF;
<a name="l01707"></a>01707 END
<a name="l01708"></a>01708 $$ LANGUAGE PLPGSQL;
<a name="l01709"></a>01709
<a name="l01710"></a>01710
<a name="l01711"></a>01711 <span class="comment">/*</span>
<a name="l01712"></a>01712 <span class="comment"> * @brief Encode a table for classification/scoring.</span>
<a name="l01713"></a>01713 <span class="comment"> *</span>
<a name="l01714"></a>01714 <span class="comment"> * @param input_tbl_name The full name of the input table.</span>
<a name="l01715"></a>01715 <span class="comment"> * @param enc_tbl_name The full name of the encoded table.</span>
<a name="l01716"></a>01716 <span class="comment"> * @param meta_tbl_name The full name of the metatable.</span>
<a name="l01717"></a>01717 <span class="comment"> * @param h2hmv_routine_id The ID of the routine which specifies </span>
<a name="l01718"></a>01718 <span class="comment"> * how to handle missing value(h2hmv). </span>
<a name="l01719"></a>01719 <span class="comment"> * @param verbosity &gt; 0 means this function runs in verbose mode. </span>
<a name="l01720"></a>01720 <span class="comment"> *</span>
<a name="l01721"></a>01721 <span class="comment"> */</span>
<a name="l01722"></a>01722 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
<a name="l01723"></a>01723 (
<a name="l01724"></a>01724 input_tbl_name TEXT,
<a name="l01725"></a>01725 enc_tbl_name TEXT,
<a name="l01726"></a>01726 meta_tbl_name TEXT,
<a name="l01727"></a>01727 h2hmv_routine_id INT,
<a name="l01728"></a>01728 verbosity INT
<a name="l01729"></a>01729 )
<a name="l01730"></a>01730 RETURNS VOID AS $$
<a name="l01731"></a>01731 DECLARE
<a name="l01732"></a>01732 curstmt TEXT;
<a name="l01733"></a>01733 attr_col_names TEXT[];
<a name="l01734"></a>01734 cls_col_name TEXT;
<a name="l01735"></a>01735 id_col_name TEXT;
<a name="l01736"></a>01736 kv_attr_name TEXT;
<a name="l01737"></a>01737 kv_cls_name TEXT;
<a name="l01738"></a>01738 is_conts BOOL[];
<a name="l01739"></a>01739 exec_begin TIMESTAMP;
<a name="l01740"></a>01740 breakup_tbl_name TEXT := &#39;tmp_breakup_table&#39;;
<a name="l01741"></a>01741 BEGIN
<a name="l01742"></a>01742 exec_begin = clock_timestamp();
<a name="l01743"></a>01743
<a name="l01744"></a>01744 curstmt = MADLIB_SCHEMA.__format
<a name="l01745"></a>01745 (
<a name="l01746"></a>01746 &#39;SELECT column_name FROM % WHERE column_type=&#39;&#39;i&#39;&#39;&#39;,
<a name="l01747"></a>01747 meta_tbl_name
<a name="l01748"></a>01748 );
<a name="l01749"></a>01749 EXECUTE curstmt INTO id_col_name;
<a name="l01750"></a>01750
<a name="l01751"></a>01751 curstmt = MADLIB_SCHEMA.__format
<a name="l01752"></a>01752 (
<a name="l01753"></a>01753 &#39;SELECT column_name FROM % WHERE column_type=&#39;&#39;c&#39;&#39;&#39;,
<a name="l01754"></a>01754 meta_tbl_name
<a name="l01755"></a>01755 );
<a name="l01756"></a>01756 EXECUTE curstmt INTO cls_col_name;
<a name="l01757"></a>01757
<a name="l01758"></a>01758 IF (NOT MADLIB_SCHEMA.__column_exists(input_tbl_name, cls_col_name)) THEN
<a name="l01759"></a>01759 cls_col_name = NULL;
<a name="l01760"></a>01760 END IF;
<a name="l01761"></a>01761
<a name="l01762"></a>01762 m4_changequote(`&gt;&gt;&gt;&#39;, `&lt;&lt;&lt;&#39;)
<a name="l01763"></a>01763 m4_ifdef(`__HAS_ORDERED_AGGREGATES__&#39;, &gt;&gt;&gt;
<a name="l01764"></a>01764 curstmt = MADLIB_SCHEMA.__format
<a name="l01765"></a>01765 (
<a name="l01766"></a>01766 &#39;SELECT <a class="code" href="array__ops_8sql__in.html#af9f60293134ba4ce05b1f97f6faba822" title="ARRAY_AGG aggregate for compatibility with GPDB &lt; 4.1 and Postgres &lt; 9.0 This is a slower solution th...">array_agg</a>(column_name order by <span class="keywordtype">id</span>)
<a name="l01767"></a>01767 FROM % WHERE column_type=&#39;&#39;f&#39;&#39;&#39;,
<a name="l01768"></a>01768 meta_tbl_name
<a name="l01769"></a>01769 );
<a name="l01770"></a>01770 EXECUTE curstmt INTO attr_col_names;
<a name="l01771"></a>01771
<a name="l01772"></a>01772 curstmt = MADLIB_SCHEMA.__format
<a name="l01773"></a>01773 (
<a name="l01774"></a>01774 &#39;SELECT
<a name="l01775"></a>01775 <a class="code" href="array__ops_8sql__in.html#af9f60293134ba4ce05b1f97f6faba822" title="ARRAY_AGG aggregate for compatibility with GPDB &lt; 4.1 and Postgres &lt; 9.0 This is a slower solution th...">array_agg</a>(is_cont order by <span class="keywordtype">id</span>)
<a name="l01776"></a>01776 FROM %
<a name="l01777"></a>01777 WHERE column_type=&#39;&#39;f&#39;&#39;&#39;,
<a name="l01778"></a>01778 meta_tbl_name
<a name="l01779"></a>01779 );
<a name="l01780"></a>01780 EXECUTE curstmt INTO is_conts;
<a name="l01781"></a>01781 &lt;&lt;&lt;, &gt;&gt;&gt;
<a name="l01782"></a>01782 curstmt = MADLIB_SCHEMA.__format
<a name="l01783"></a>01783 (
<a name="l01784"></a>01784 &#39;SELECT ARRAY
<a name="l01785"></a>01785 (
<a name="l01786"></a>01786 SELECT column_name
<a name="l01787"></a>01787 FROM % WHERE column_type=&#39;&#39;f&#39;&#39;
<a name="l01788"></a>01788 ORDER BY <span class="keywordtype">id</span>
<a name="l01789"></a>01789 LIMIT ALL
<a name="l01790"></a>01790 )&#39;,
<a name="l01791"></a>01791 meta_tbl_name
<a name="l01792"></a>01792 );
<a name="l01793"></a>01793 EXECUTE curstmt INTO attr_col_names;
<a name="l01794"></a>01794
<a name="l01795"></a>01795 curstmt = MADLIB_SCHEMA.__format
<a name="l01796"></a>01796 (
<a name="l01797"></a>01797 &#39;SELECT ARRAY
<a name="l01798"></a>01798 (
<a name="l01799"></a>01799 SELECT is_cont
<a name="l01800"></a>01800 FROM %
<a name="l01801"></a>01801 WHERE column_type=&#39;&#39;f&#39;&#39;
<a name="l01802"></a>01802 ORDER BY <span class="keywordtype">id</span>
<a name="l01803"></a>01803 LIMIT ALL
<a name="l01804"></a>01804 )&#39;,
<a name="l01805"></a>01805 meta_tbl_name
<a name="l01806"></a>01806 );
<a name="l01807"></a>01807 EXECUTE curstmt INTO is_conts;
<a name="l01808"></a>01808 &lt;&lt;&lt;)
<a name="l01809"></a>01809 m4_changequote(&gt;&gt;&gt;`&lt;&lt;&lt;, &gt;&gt;&gt;&#39;&lt;&lt;&lt;)
<a name="l01810"></a>01810
<a name="l01811"></a>01811 curstmt = MADLIB_SCHEMA.__format
<a name="l01812"></a>01812 (
<a name="l01813"></a>01813 &#39;SELECT
<a name="l01814"></a>01814 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
<a name="l01815"></a>01815 FROM %
<a name="l01816"></a>01816 WHERE column_type=&#39;&#39;f&#39;&#39; limit 1&#39;,
<a name="l01817"></a>01817 meta_tbl_name
<a name="l01818"></a>01818 );
<a name="l01819"></a>01819 EXECUTE curstmt INTO kv_attr_name;
<a name="l01820"></a>01820
<a name="l01821"></a>01821 curstmt = MADLIB_SCHEMA.__format
<a name="l01822"></a>01822 (
<a name="l01823"></a>01823 &#39;SELECT
<a name="l01824"></a>01824 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
<a name="l01825"></a>01825 FROM %
<a name="l01826"></a>01826 WHERE column_type=&#39;&#39;c&#39;&#39; limit 1&#39;,
<a name="l01827"></a>01827 meta_tbl_name
<a name="l01828"></a>01828 );
<a name="l01829"></a>01829 EXECUTE curstmt INTO kv_cls_name;
<a name="l01830"></a>01830
<a name="l01831"></a>01831 PERFORM MADLIB_SCHEMA.__validate_input_table
<a name="l01832"></a>01832 (
<a name="l01833"></a>01833 input_tbl_name,
<a name="l01834"></a>01834 NULL,
<a name="l01835"></a>01835 id_col_name,
<a name="l01836"></a>01836 NULL
<a name="l01837"></a>01837 );
<a name="l01838"></a>01838
<a name="l01839"></a>01839 -- breakup each record from the classification/scoring
<a name="l01840"></a>01840 -- table and kept the results into a new table.
<a name="l01841"></a>01841 PERFORM MADLIB_SCHEMA.__breakup_table
<a name="l01842"></a>01842 (
<a name="l01843"></a>01843 input_tbl_name,
<a name="l01844"></a>01844 breakup_tbl_name,
<a name="l01845"></a>01845 kv_cls_name,
<a name="l01846"></a>01846 id_col_name,
<a name="l01847"></a>01847 cls_col_name,
<a name="l01848"></a>01848 attr_col_names,
<a name="l01849"></a>01849 is_conts,
<a name="l01850"></a>01850 h2hmv_routine_id,
<a name="l01851"></a>01851 verbosity
<a name="l01852"></a>01852 );
<a name="l01853"></a>01853
<a name="l01854"></a>01854 -- generate the vertical encoded table.
<a name="l01855"></a>01855 PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
<a name="l01856"></a>01856 (
<a name="l01857"></a>01857 breakup_tbl_name,
<a name="l01858"></a>01858 &#39;dt_tmp_ver_table&#39;,
<a name="l01859"></a>01859 kv_attr_name,
<a name="l01860"></a>01860 &#39;t&#39;::BOOL,
<a name="l01861"></a>01861 verbosity
<a name="l01862"></a>01862 );
<a name="l01863"></a>01863
<a name="l01864"></a>01864 -- generate the horizontal encoded table.
<a name="l01865"></a>01865 EXECUTE &#39;DROP TABLE IF EXISTS &#39; || enc_tbl_name;
<a name="l01866"></a>01866 PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
<a name="l01867"></a>01867 (
<a name="l01868"></a>01868 enc_tbl_name,
<a name="l01869"></a>01869 &#39;dt_tmp_ver_table&#39;,
<a name="l01870"></a>01870 array_upper(is_conts, 1),
<a name="l01871"></a>01871 verbosity
<a name="l01872"></a>01872 );
<a name="l01873"></a>01873
<a name="l01874"></a>01874 EXECUTE &#39;DROP TABLE IF EXISTS dt_tmp_ver_table&#39;;
<a name="l01875"></a>01875
<a name="l01876"></a>01876 IF (verbosity &gt; 0) THEN
<a name="l01877"></a>01877 RAISE INFO &#39;Encoding time:%&#39;, clock_timestamp() - exec_begin;
<a name="l01878"></a>01878 END IF;
<a name="l01879"></a>01879 END
<a name="l01880"></a>01880 $$ LANGUAGE PLPGSQL;
</pre></div></div>
</div>
<div id="nav-path" class="navpath">
<ul>
<li class="navelem"><a class="el" href="dt__preproc_8sql__in.html">dt_preproc.sql_in</a> </li>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<li class="footer">Generated on Tue Apr 2 2013 14:57:03 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
</ul>
</div>
</body>
</html>