blob: bb565de172e8da56327138baf7cb2bea82795b26 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<title>MADlib: bayes.sql_in Source File</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script src="../mathjax/MathJax.js">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script>
</head>
<body>
<div id="top"><!-- do not remove this div! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">0.6</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./bayes_8sql__in_source.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- Generated by Doxygen 1.7.5.1 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
<script type="text/javascript" src="dynsections.js"></script>
<div id="navrow1" class="tabs">
<ul class="tablist">
<li><a href="index.html"><span>Main&#160;Page</span></a></li>
<li><a href="modules.html"><span>Modules</span></a></li>
<li class="current"><a href="files.html"><span>Files</span></a></li>
<li>
<div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</li>
</ul>
</div>
<div id="navrow2" class="tabs2">
<ul class="tablist">
<li><a href="files.html"><span>File&#160;List</span></a></li>
<li><a href="globals.html"><span>File&#160;Members</span></a></li>
</ul>
</div>
</div>
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
initNavTree('bayes_8sql__in.html','');
</script>
<div id="doc-content">
<div class="header">
<div class="headertitle">
<div class="title">bayes.sql_in</div> </div>
</div>
<div class="contents">
<a href="bayes_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/**</span>
<a name="l00002"></a>00002 <span class="comment"> *</span>
<a name="l00003"></a>00003 <span class="comment"> * @file bayes.sql_in</span>
<a name="l00004"></a>00004 <span class="comment"> *</span>
<a name="l00005"></a>00005 <span class="comment"> * @brief SQL functions for naive Bayes</span>
<a name="l00006"></a>00006 <span class="comment"> * @date January 2011</span>
<a name="l00007"></a>00007 <span class="comment"> *</span>
<a name="l00008"></a>00008 <span class="comment"> * @sa For a brief introduction to Naive Bayes Classification, see the module</span>
<a name="l00009"></a>00009 <span class="comment"> * description \ref grp_bayes.</span>
<a name="l00010"></a>00010 <span class="comment"> *</span>
<a name="l00011"></a>00011 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
<a name="l00012"></a>00012
<a name="l00013"></a>00013 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
<a name="l00014"></a>00014 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00015"></a>00015 <span class="comment">/**</span>
<a name="l00016"></a>00016 <span class="comment">@addtogroup grp_bayes</span>
<a name="l00017"></a>00017 <span class="comment"></span>
<a name="l00018"></a>00018 <span class="comment">@about</span>
<a name="l00019"></a>00019 <span class="comment"></span>
<a name="l00020"></a>00020 <span class="comment">Naive Bayes refers to a stochastic model where all independent variables</span>
<a name="l00021"></a>00021 <span class="comment">\f$ a_1, \dots, a_n \f$ (often referred to as attributes in this context)</span>
<a name="l00022"></a>00022 <span class="comment">independently contribute to the probability that a data point belongs to a</span>
<a name="l00023"></a>00023 <span class="comment">certain class \f$ c \f$. In detail, \b Bayes&#39; theorem states that</span>
<a name="l00024"></a>00024 <span class="comment">\f[</span>
<a name="l00025"></a>00025 <span class="comment"> \Pr(C = c \mid A_1 = a_1, \dots, A_n = a_n)</span>
<a name="l00026"></a>00026 <span class="comment"> = \frac{\Pr(C = c) \cdot \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c)}</span>
<a name="l00027"></a>00027 <span class="comment"> {\Pr(A_1 = a_1, \dots, A_n = a_n)}</span>
<a name="l00028"></a>00028 <span class="comment"> \,,</span>
<a name="l00029"></a>00029 <span class="comment">\f]</span>
<a name="l00030"></a>00030 <span class="comment">and the \b naive assumption is that</span>
<a name="l00031"></a>00031 <span class="comment">\f[</span>
<a name="l00032"></a>00032 <span class="comment"> \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c)</span>
<a name="l00033"></a>00033 <span class="comment"> = \prod_{i=1}^n \Pr(A_i = a_i \mid C = c)</span>
<a name="l00034"></a>00034 <span class="comment"> \,.</span>
<a name="l00035"></a>00035 <span class="comment">\f]</span>
<a name="l00036"></a>00036 <span class="comment">Naive Bayes classification estimates feature probabilities and class priors</span>
<a name="l00037"></a>00037 <span class="comment">using maximum likelihood or Laplacian smoothing. These parameters are then used</span>
<a name="l00038"></a>00038 <span class="comment">to classify new data.</span>
<a name="l00039"></a>00039 <span class="comment"></span>
<a name="l00040"></a>00040 <span class="comment">A Naive Bayes classifier computes the following formula:</span>
<a name="l00041"></a>00041 <span class="comment">\f[</span>
<a name="l00042"></a>00042 <span class="comment"> \text{classify}(a_1, ..., a_n)</span>
<a name="l00043"></a>00043 <span class="comment"> = \arg\max_c \left\{</span>
<a name="l00044"></a>00044 <span class="comment"> \Pr(C = c) \cdot \prod_{i=1}^n \Pr(A_i = a_i \mid C = c)</span>
<a name="l00045"></a>00045 <span class="comment"> \right\}</span>
<a name="l00046"></a>00046 <span class="comment">\f]</span>
<a name="l00047"></a>00047 <span class="comment">where \f$ c \f$ ranges over all classes in the training data and probabilities</span>
<a name="l00048"></a>00048 <span class="comment">are estimated with relative frequencies from the training set.</span>
<a name="l00049"></a>00049 <span class="comment">There are different ways to estimate the feature probabilities</span>
<a name="l00050"></a>00050 <span class="comment">\f$ P(A_i = a \mid C = c) \f$. The maximum likelihood estimate takes the</span>
<a name="l00051"></a>00051 <span class="comment">relative frequencies. That is:</span>
<a name="l00052"></a>00052 <span class="comment">\f[</span>
<a name="l00053"></a>00053 <span class="comment"> P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c}</span>
<a name="l00054"></a>00054 <span class="comment">\f]</span>
<a name="l00055"></a>00055 <span class="comment">where</span>
<a name="l00056"></a>00056 <span class="comment">- \f$ \#(c,i,a) \f$ denotes the # of training samples where attribute \f$ i \f$</span>
<a name="l00057"></a>00057 <span class="comment"> is \f$ a \f$ and class is \f$ c \f$</span>
<a name="l00058"></a>00058 <span class="comment">- \f$ \#c \f$ denotes the # of training samples where class is \f$ c \f$.</span>
<a name="l00059"></a>00059 <span class="comment"></span>
<a name="l00060"></a>00060 <span class="comment">Since the maximum likelihood sometimes results in estimates of &quot;0&quot;, you might</span>
<a name="l00061"></a>00061 <span class="comment">want to use a &quot;smoothed&quot; estimate. To do this, you add a number of &quot;virtual&quot;</span>
<a name="l00062"></a>00062 <span class="comment">samples and make the assumption that these samples are evenly distributed among</span>
<a name="l00063"></a>00063 <span class="comment">the values assumed by attribute \f$ i \f$ (that is, the set of all values</span>
<a name="l00064"></a>00064 <span class="comment">observed for attribute \f$ a \f$ for any class):</span>
<a name="l00065"></a>00065 <span class="comment"></span>
<a name="l00066"></a>00066 <span class="comment">\f[</span>
<a name="l00067"></a>00067 <span class="comment"> P(A_i = a \mid C = c) = \frac{\#(c,i,a) + s}{\#c + s \cdot \#i}</span>
<a name="l00068"></a>00068 <span class="comment">\f]</span>
<a name="l00069"></a>00069 <span class="comment">where</span>
<a name="l00070"></a>00070 <span class="comment">- \f$ \#i \f$ denotes the # of distinct values for attribute \f$ i \f$ (for all</span>
<a name="l00071"></a>00071 <span class="comment"> classes)</span>
<a name="l00072"></a>00072 <span class="comment">- \f$ s \geq 0 \f$ denotes the smoothing factor.</span>
<a name="l00073"></a>00073 <span class="comment"></span>
<a name="l00074"></a>00074 <span class="comment">The case \f$ s = 1 \f$ is known as &quot;Laplace smoothing&quot;. The case \f$ s = 0 \f$</span>
<a name="l00075"></a>00075 <span class="comment">trivially reduces to maximum-likelihood estimates.</span>
<a name="l00076"></a>00076 <span class="comment"></span>
<a name="l00077"></a>00077 <span class="comment">\b Note:</span>
<a name="l00078"></a>00078 <span class="comment">(1) The probabilities computed on the platforms of PostgreSQL and Greenplum</span>
<a name="l00079"></a>00079 <span class="comment">database have a small difference due to the nature of floating point</span>
<a name="l00080"></a>00080 <span class="comment">computation. Usually this is not important. However, if a data point has</span>
<a name="l00081"></a>00081 <span class="comment">\f[</span>
<a name="l00082"></a>00082 <span class="comment">P(C=c_i \mid A) \approx P(C=c_j \mid A)</span>
<a name="l00083"></a>00083 <span class="comment">\f]</span>
<a name="l00084"></a>00084 <span class="comment">for two classes, this data point might be classified into different classes on</span>
<a name="l00085"></a>00085 <span class="comment">PostgreSQL and Greenplum. This leads to the differences in classifications</span>
<a name="l00086"></a>00086 <span class="comment">on PostgreSQL and Greenplum for some data sets, but this should not</span>
<a name="l00087"></a>00087 <span class="comment">affect the quality of the results.</span>
<a name="l00088"></a>00088 <span class="comment"></span>
<a name="l00089"></a>00089 <span class="comment">(2) When two classes have equal and highest probability among all classes,</span>
<a name="l00090"></a>00090 <span class="comment">the classification result is an array of these two classes, but the order</span>
<a name="l00091"></a>00091 <span class="comment">of the two classes is random.</span>
<a name="l00092"></a>00092 <span class="comment"></span>
<a name="l00093"></a>00093 <span class="comment">(3) The current implementation of Naive Bayes classification is only suitable</span>
<a name="l00094"></a>00094 <span class="comment">for discontinuous (categorical) attributes.</span>
<a name="l00095"></a>00095 <span class="comment"></span>
<a name="l00096"></a>00096 <span class="comment">For continuous data, a typical assumption, usually used for small datasets,</span>
<a name="l00097"></a>00097 <span class="comment">is that the continuous values associated with each class are distributed</span>
<a name="l00098"></a>00098 <span class="comment">according to a Gaussian distribution,</span>
<a name="l00099"></a>00099 <span class="comment">and then the probabilities \f$ P(A_i = a \mid C=c) \f$ can be estimated.</span>
<a name="l00100"></a>00100 <span class="comment">Another common technique for handling continuous values, which is better for</span>
<a name="l00101"></a>00101 <span class="comment">large data sets, is to use binning to discretize the values, and convert the</span>
<a name="l00102"></a>00102 <span class="comment">continuous data into categorical bins. These approaches are currently not</span>
<a name="l00103"></a>00103 <span class="comment">implemented but are planned for future releases.</span>
<a name="l00104"></a>00104 <span class="comment"></span>
<a name="l00105"></a>00105 <span class="comment">(4) One can still provide floating point data to the naive Bayes</span>
<a name="l00106"></a>00106 <span class="comment">classification function. Floating point numbers can be used as symbolic</span>
<a name="l00107"></a>00107 <span class="comment">substitutions for categorical data. The classification would work best if</span>
<a name="l00108"></a>00108 <span class="comment">there are sufficient data points for each floating point attribute. However,</span>
<a name="l00109"></a>00109 <span class="comment">if floating point numbers are used as continuous data, no warning is raised and</span>
<a name="l00110"></a>00110 <span class="comment">the result may not be as expected.</span>
<a name="l00111"></a>00111 <span class="comment"></span>
<a name="l00112"></a>00112 <span class="comment">@input</span>
<a name="l00113"></a>00113 <span class="comment"></span>
<a name="l00114"></a>00114 <span class="comment">The &lt;b&gt;training data&lt;/b&gt; is expected to be of the following form:</span>
<a name="l00115"></a>00115 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;trainingSource&lt;/em&gt; (</span>
<a name="l00116"></a>00116 <span class="comment"> ...</span>
<a name="l00117"></a>00117 <span class="comment"> &lt;em&gt;trainingClassColumn&lt;/em&gt; INTEGER,</span>
<a name="l00118"></a>00118 <span class="comment"> &lt;em&gt;trainingAttrColumn&lt;/em&gt; INTEGER[],</span>
<a name="l00119"></a>00119 <span class="comment"> ...</span>
<a name="l00120"></a>00120 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00121"></a>00121 <span class="comment"></span>
<a name="l00122"></a>00122 <span class="comment">The &lt;b&gt;data to classify&lt;/b&gt; is expected to be of the following form:</span>
<a name="l00123"></a>00123 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;classifySource&lt;/em&gt; (</span>
<a name="l00124"></a>00124 <span class="comment"> ...</span>
<a name="l00125"></a>00125 <span class="comment"> &lt;em&gt;classifyKeyColumn&lt;/em&gt; ANYTYPE,</span>
<a name="l00126"></a>00126 <span class="comment"> &lt;em&gt;classifyAttrColumn&lt;/em&gt; INTEGER[],</span>
<a name="l00127"></a>00127 <span class="comment"> ...</span>
<a name="l00128"></a>00128 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00129"></a>00129 <span class="comment"></span>
<a name="l00130"></a>00130 <span class="comment">@usage</span>
<a name="l00131"></a>00131 <span class="comment"></span>
<a name="l00132"></a>00132 <span class="comment">- Precompute feature probabilities and class priors:</span>
<a name="l00133"></a>00133 <span class="comment"> &lt;pre&gt;SELECT \ref create_nb_prepared_data_tables(</span>
<a name="l00134"></a>00134 <span class="comment"> &#39;&lt;em&gt;trainingSource&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingClassColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00135"></a>00135 <span class="comment"> &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;</span>
<a name="l00136"></a>00136 <span class="comment"> );&lt;/pre&gt;</span>
<a name="l00137"></a>00137 <span class="comment"> This creates table &lt;em&gt;featureProbsName&lt;/em&gt; for storing feature</span>
<a name="l00138"></a>00138 <span class="comment"> probabilities and table &lt;em&gt;classPriorsName&lt;/em&gt; for storing the class priors.</span>
<a name="l00139"></a>00139 <span class="comment">- Perform Naive Bayes classification:</span>
<a name="l00140"></a>00140 <span class="comment"> &lt;pre&gt;SELECT \ref create_nb_classify_view(</span>
<a name="l00141"></a>00141 <span class="comment"> &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;,</span>
<a name="l00142"></a>00142 <span class="comment"> &#39;&lt;em&gt;classifySource&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyKeyColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00143"></a>00143 <span class="comment"> &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;destName&lt;/em&gt;&#39;</span>
<a name="l00144"></a>00144 <span class="comment"> );&lt;/pre&gt;</span>
<a name="l00145"></a>00145 <span class="comment"> This creates the view &lt;tt&gt;&lt;em&gt;destName&lt;/em&gt;&lt;/tt&gt; mapping</span>
<a name="l00146"></a>00146 <span class="comment"> &lt;em&gt;classifyKeyColumn&lt;/em&gt; to the Naive Bayes classification:</span>
<a name="l00147"></a>00147 <span class="comment"> &lt;pre&gt;key | nb_classification</span>
<a name="l00148"></a>00148 <span class="comment">----+------------------</span>
<a name="l00149"></a>00149 <span class="comment">...&lt;/pre&gt;</span>
<a name="l00150"></a>00150 <span class="comment">- Compute Naive Bayes probabilities:</span>
<a name="l00151"></a>00151 <span class="comment"> &lt;pre&gt;SELECT \ref create_nb_probs_view(</span>
<a name="l00152"></a>00152 <span class="comment"> &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;,</span>
<a name="l00153"></a>00153 <span class="comment"> &#39;&lt;em&gt;classifySource&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyKeyColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00154"></a>00154 <span class="comment"> &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;destName&lt;/em&gt;&#39;</span>
<a name="l00155"></a>00155 <span class="comment">);&lt;/pre&gt;</span>
<a name="l00156"></a>00156 <span class="comment"> This creates the view &lt;tt&gt;&lt;em&gt;destName&lt;/em&gt;&lt;/tt&gt; mapping</span>
<a name="l00157"></a>00157 <span class="comment"> &lt;em&gt;classifyKeyColumn&lt;/em&gt; and every single class to the Naive Bayes</span>
<a name="l00158"></a>00158 <span class="comment"> probability:</span>
<a name="l00159"></a>00159 <span class="comment"> &lt;pre&gt;key | class | nb_prob</span>
<a name="l00160"></a>00160 <span class="comment">----+-------+--------</span>
<a name="l00161"></a>00161 <span class="comment">...&lt;/pre&gt;</span>
<a name="l00162"></a>00162 <span class="comment">- Ad-hoc execution (no precomputation):</span>
<a name="l00163"></a>00163 <span class="comment"> Functions \ref create_nb_classify_view and</span>
<a name="l00164"></a>00164 <span class="comment"> \ref create_nb_probs_view can be used in an ad-hoc fashion without the above</span>
<a name="l00165"></a>00165 <span class="comment"> precomputation step. In this case, replace the function arguments</span>
<a name="l00166"></a>00166 <span class="comment"> &lt;pre&gt;&#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;&lt;/pre&gt;</span>
<a name="l00167"></a>00167 <span class="comment"> with</span>
<a name="l00168"></a>00168 <span class="comment"> &lt;pre&gt;&#39;&lt;em&gt;trainingSource&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingClassColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingAttrColumn&lt;/em&gt;&#39;&lt;/pre&gt;</span>
<a name="l00169"></a>00169 <span class="comment"></span>
<a name="l00170"></a>00170 <span class="comment">@examp</span>
<a name="l00171"></a>00171 <span class="comment"></span>
<a name="l00172"></a>00172 <span class="comment">The following is an extremely simplified example of the above option #1 which</span>
<a name="l00173"></a>00173 <span class="comment">can be verified by hand.</span>
<a name="l00174"></a>00174 <span class="comment"></span>
<a name="l00175"></a>00175 <span class="comment">-# The training and the classification data:</span>
<a name="l00176"></a>00176 <span class="comment">\verbatim</span>
<a name="l00177"></a>00177 <span class="comment">sql&gt; SELECT * FROM training;</span>
<a name="l00178"></a>00178 <span class="comment"> id | class | attributes</span>
<a name="l00179"></a>00179 <span class="comment">----+-------+------------</span>
<a name="l00180"></a>00180 <span class="comment"> 1 | 1 | {1,2,3}</span>
<a name="l00181"></a>00181 <span class="comment"> 2 | 1 | {1,2,1}</span>
<a name="l00182"></a>00182 <span class="comment"> 3 | 1 | {1,4,3}</span>
<a name="l00183"></a>00183 <span class="comment"> 4 | 2 | {1,2,2}</span>
<a name="l00184"></a>00184 <span class="comment"> 5 | 2 | {0,2,2}</span>
<a name="l00185"></a>00185 <span class="comment"> 6 | 2 | {0,1,3}</span>
<a name="l00186"></a>00186 <span class="comment">(6 rows)</span>
<a name="l00187"></a>00187 <span class="comment"></span>
<a name="l00188"></a>00188 <span class="comment">sql&gt; select * from toclassify;</span>
<a name="l00189"></a>00189 <span class="comment"> id | attributes</span>
<a name="l00190"></a>00190 <span class="comment">----+------------</span>
<a name="l00191"></a>00191 <span class="comment"> 1 | {0,2,1}</span>
<a name="l00192"></a>00192 <span class="comment"> 2 | {1,2,3}</span>
<a name="l00193"></a>00193 <span class="comment">(2 rows)</span>
<a name="l00194"></a>00194 <span class="comment">\endverbatim</span>
<a name="l00195"></a>00195 <span class="comment">-# Precompute feature probabilities and class priors</span>
<a name="l00196"></a>00196 <span class="comment">\verbatim</span>
<a name="l00197"></a>00197 <span class="comment">sql&gt; SELECT madlib.create_nb_prepared_data_tables(</span>
<a name="l00198"></a>00198 <span class="comment">&#39;training&#39;, &#39;class&#39;, &#39;attributes&#39;, 3, &#39;nb_feature_probs&#39;, &#39;nb_class_priors&#39;);</span>
<a name="l00199"></a>00199 <span class="comment">\endverbatim</span>
<a name="l00200"></a>00200 <span class="comment">-# Optionally check the contents of the precomputed tables:</span>
<a name="l00201"></a>00201 <span class="comment">\verbatim</span>
<a name="l00202"></a>00202 <span class="comment">sql&gt; SELECT * FROM nb_class_priors;</span>
<a name="l00203"></a>00203 <span class="comment"> class | class_cnt | all_cnt</span>
<a name="l00204"></a>00204 <span class="comment">-------+-----------+---------</span>
<a name="l00205"></a>00205 <span class="comment"> 1 | 3 | 6</span>
<a name="l00206"></a>00206 <span class="comment"> 2 | 3 | 6</span>
<a name="l00207"></a>00207 <span class="comment">(2 rows)</span>
<a name="l00208"></a>00208 <span class="comment"></span>
<a name="l00209"></a>00209 <span class="comment">sql&gt; SELECT * FROM nb_feature_probs;</span>
<a name="l00210"></a>00210 <span class="comment"> class | attr | value | cnt | attr_cnt</span>
<a name="l00211"></a>00211 <span class="comment">-------+------+-------+-----+----------</span>
<a name="l00212"></a>00212 <span class="comment"> 1 | 1 | 0 | 0 | 2</span>
<a name="l00213"></a>00213 <span class="comment"> 1 | 1 | 1 | 3 | 2</span>
<a name="l00214"></a>00214 <span class="comment"> 1 | 2 | 1 | 0 | 3</span>
<a name="l00215"></a>00215 <span class="comment"> 1 | 2 | 2 | 2 | 3</span>
<a name="l00216"></a>00216 <span class="comment">...</span>
<a name="l00217"></a>00217 <span class="comment">\endverbatim</span>
<a name="l00218"></a>00218 <span class="comment">-# Create the view with Naive Bayes classification and check the results:</span>
<a name="l00219"></a>00219 <span class="comment">\verbatim</span>
<a name="l00220"></a>00220 <span class="comment">sql&gt; SELECT madlib.create_nb_classify_view (</span>
<a name="l00221"></a>00221 <span class="comment">&#39;nb_feature_probs&#39;, &#39;nb_class_priors&#39;, &#39;toclassify&#39;, &#39;id&#39;, &#39;attributes&#39;, 3, &#39;nb_classify_view_fast&#39;);</span>
<a name="l00222"></a>00222 <span class="comment"></span>
<a name="l00223"></a>00223 <span class="comment">sql&gt; SELECT * FROM nb_classify_view_fast;</span>
<a name="l00224"></a>00224 <span class="comment"> key | nb_classification</span>
<a name="l00225"></a>00225 <span class="comment">-----+-------------------</span>
<a name="l00226"></a>00226 <span class="comment"> 1 | {2}</span>
<a name="l00227"></a>00227 <span class="comment"> 2 | {1}</span>
<a name="l00228"></a>00228 <span class="comment">(2 rows)</span>
<a name="l00229"></a>00229 <span class="comment">\endverbatim</span>
<a name="l00230"></a>00230 <span class="comment">-# Look at the probabilities for each class (note that we use &quot;Laplacian smoothing&quot;):</span>
<a name="l00231"></a>00231 <span class="comment">\verbatim</span>
<a name="l00232"></a>00232 <span class="comment">sql&gt; SELECT madlib.create_nb_probs_view (</span>
<a name="l00233"></a>00233 <span class="comment">&#39;nb_feature_probs&#39;, &#39;nb_class_priors&#39;, &#39;toclassify&#39;, &#39;id&#39;, &#39;attributes&#39;, 3, &#39;nb_probs_view_fast&#39;);</span>
<a name="l00234"></a>00234 <span class="comment"></span>
<a name="l00235"></a>00235 <span class="comment">sql&gt; SELECT * FROM nb_probs_view_fast;</span>
<a name="l00236"></a>00236 <span class="comment"> key | class | nb_prob</span>
<a name="l00237"></a>00237 <span class="comment">-----+-------+---------</span>
<a name="l00238"></a>00238 <span class="comment"> 1 | 1 | 0.4</span>
<a name="l00239"></a>00239 <span class="comment"> 1 | 2 | 0.6</span>
<a name="l00240"></a>00240 <span class="comment"> 2 | 1 | 0.75</span>
<a name="l00241"></a>00241 <span class="comment"> 2 | 2 | 0.25</span>
<a name="l00242"></a>00242 <span class="comment">(4 rows)</span>
<a name="l00243"></a>00243 <span class="comment">\endverbatim</span>
<a name="l00244"></a>00244 <span class="comment"></span>
<a name="l00245"></a>00245 <span class="comment">@literature</span>
<a name="l00246"></a>00246 <span class="comment"></span>
<a name="l00247"></a>00247 <span class="comment">[1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter</span>
<a name="l00248"></a>00248 <span class="comment"> &lt;em&gt;Generative and Discriminative Classifiers: Naive Bayes and Logistic</span>
<a name="l00249"></a>00249 <span class="comment"> Regression&lt;/em&gt; available at: http://www.cs.cmu.edu/~tom/NewChapters.html</span>
<a name="l00250"></a>00250 <span class="comment"></span>
<a name="l00251"></a>00251 <span class="comment">[2] Wikipedia, Naive Bayes classifier,</span>
<a name="l00252"></a>00252 <span class="comment"> http://en.wikipedia.org/wiki/Naive_Bayes_classifier</span>
<a name="l00253"></a>00253 <span class="comment"></span>
<a name="l00254"></a>00254 <span class="comment">@sa File bayes.sql_in documenting the SQL functions.</span>
<a name="l00255"></a>00255 <span class="comment"></span>
<a name="l00256"></a>00256 <span class="comment">@internal</span>
<a name="l00257"></a>00257 <span class="comment">@sa namespace bayes (documenting the implementation in Python)</span>
<a name="l00258"></a>00258 <span class="comment">@endinternal</span>
<a name="l00259"></a>00259 <span class="comment"></span>
<a name="l00260"></a>00260 <span class="comment">*/</span>
<a name="l00261"></a>00261
<a name="l00262"></a>00262 -- Begin of argmax definition
<a name="l00263"></a>00263
<a name="l00264"></a>00264 CREATE TYPE MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS (
<a name="l00265"></a>00265 args INTEGER[],
<a name="l00266"></a>00266 value DOUBLE PRECISION
<a name="l00267"></a>00267 );
<a name="l00268"></a>00268
<a name="l00269"></a>00269 CREATE FUNCTION MADLIB_SCHEMA.argmax_transition(
<a name="l00270"></a>00270 oldmax MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
<a name="l00271"></a>00271 newkey INTEGER,
<a name="l00272"></a>00272 newvalue DOUBLE PRECISION)
<a name="l00273"></a>00273 RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
<a name="l00274"></a>00274 $$
<a name="l00275"></a>00275 SELECT CASE WHEN $3 &lt; $1.value OR $2 IS NULL OR ($3 IS NULL AND NOT $1.value IS NULL) THEN $1
<a name="l00276"></a>00276 WHEN $3 = $1.value OR ($3 IS NULL AND $1.value IS NULL AND NOT $1.args IS NULL)
<a name="l00277"></a>00277 THEN ($1.args || $2, $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
<a name="l00278"></a>00278 ELSE (array[$2], $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
<a name="l00279"></a>00279 END
<a name="l00280"></a>00280 $$
<a name="l00281"></a>00281 LANGUAGE sql IMMUTABLE;
<a name="l00282"></a>00282
<a name="l00283"></a>00283 CREATE FUNCTION MADLIB_SCHEMA.argmax_combine(
<a name="l00284"></a>00284 max1 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
<a name="l00285"></a>00285 max2 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
<a name="l00286"></a>00286 RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
<a name="l00287"></a>00287 $$
<a name="l00288"></a>00288 -- If SQL guaranteed short-circuit evaluation, the following could become
<a name="l00289"></a>00289 -- shorter. Unfortunately, this is not the case.
<a name="l00290"></a>00290 -- Section 6.3.3.3 of ISO/IEC 9075-1:2008 Framework (SQL/Framework):
<a name="l00291"></a>00291 --
<a name="l00292"></a>00292 -- &quot;However, it is implementation-dependent whether expressions are
<a name="l00293"></a>00293 -- actually evaluated left to right, particularly when operands or
<a name="l00294"></a>00294 -- operators might cause conditions to be raised or if the results of the
<a name="l00295"></a>00295 -- expressions can be determined without completely evaluating all parts
<a name="l00296"></a>00296 -- of the expression.&quot;
<a name="l00297"></a>00297 --
<a name="l00298"></a>00298 -- Again, the optimizer does its job hopefully.
<a name="l00299"></a>00299 SELECT CASE WHEN $1 IS NULL THEN $2
<a name="l00300"></a>00300 WHEN $2 IS NULL THEN $1
<a name="l00301"></a>00301 WHEN ($1.value = $2.value) OR ($1.value IS NULL AND $2.value IS NULL)
<a name="l00302"></a>00302 THEN ($1.args || $2.args, $1.value)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
<a name="l00303"></a>00303 WHEN $1.value IS NULL OR $1.value &lt; $2.value THEN $2
<a name="l00304"></a>00304 ELSE $1
<a name="l00305"></a>00305 END
<a name="l00306"></a>00306 $$
<a name="l00307"></a>00307 LANGUAGE sql IMMUTABLE;
<a name="l00308"></a>00308
<a name="l00309"></a>00309 CREATE FUNCTION MADLIB_SCHEMA.argmax_final(
<a name="l00310"></a>00310 finalstate MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
<a name="l00311"></a>00311 RETURNS INTEGER[] AS
<a name="l00312"></a>00312 $$
<a name="l00313"></a>00313 SELECT $1.args
<a name="l00314"></a>00314 $$
<a name="l00315"></a>00315 LANGUAGE sql IMMUTABLE;
<a name="l00316"></a>00316 <span class="comment"></span>
<a name="l00317"></a>00317 <span class="comment">/**</span>
<a name="l00318"></a>00318 <span class="comment"> * @internal</span>
<a name="l00319"></a>00319 <span class="comment"> * @brief Argmax: Return the key of the row for which value is maximal</span>
<a name="l00320"></a>00320 <span class="comment"> *</span>
<a name="l00321"></a>00321 <span class="comment"> * The &quot;index set&quot; of the argmax function is of type INTEGER and we range over</span>
<a name="l00322"></a>00322 <span class="comment"> * DOUBLE PRECISION values. It is not required that all keys are distinct.</span>
<a name="l00323"></a>00323 <span class="comment"> *</span>
<a name="l00324"></a>00324 <span class="comment"> * @note</span>
<a name="l00325"></a>00325 <span class="comment"> * argmax should only be used on unsorted data because it will not exploit</span>
<a name="l00326"></a>00326 <span class="comment"> * indices, and its running time is \f$ \Theta(n) \f$.</span>
<a name="l00327"></a>00327 <span class="comment"> *</span>
<a name="l00328"></a>00328 <span class="comment"> * @implementation</span>
<a name="l00329"></a>00329 <span class="comment"> * The implementation is in SQL, with a flavor of functional programming.</span>
<a name="l00330"></a>00330 <span class="comment"> * The hope is that the optimizer does a good job here.</span>
<a name="l00331"></a>00331 <span class="comment"> */</span>
<a name="l00332"></a>00332 CREATE AGGREGATE MADLIB_SCHEMA.argmax(/*+ key */ INTEGER, /*+ value */ DOUBLE PRECISION) (
<a name="l00333"></a>00333 SFUNC=MADLIB_SCHEMA.argmax_transition,
<a name="l00334"></a>00334 STYPE=MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
<a name="l00335"></a>00335 m4_ifdef(`__GREENPLUM__&#39;,`prefunc=MADLIB_SCHEMA.argmax_combine,<span class="stringliteral">&#39;)</span>
<a name="l00336"></a>00336 <span class="stringliteral"> FINALFUNC=MADLIB_SCHEMA.argmax_final</span>
<a name="l00337"></a>00337 <span class="stringliteral">);</span>
<a name="l00338"></a>00338 <span class="stringliteral"></span>
<a name="l00339"></a>00339 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00340"></a>00340 <span class="comment">/**</span>
<a name="l00341"></a>00341 <span class="comment"> * @brief Precompute all class priors and feature probabilities</span>
<a name="l00342"></a>00342 <span class="comment"> *</span>
<a name="l00343"></a>00343 <span class="comment"> * Feature probabilities are stored in a table of format</span>
<a name="l00344"></a>00344 <span class="comment"> * &lt;pre&gt;TABLE &lt;em&gt;featureProbsDestName&lt;/em&gt; (</span>
<a name="l00345"></a>00345 <span class="comment"> * class INTEGER,</span>
<a name="l00346"></a>00346 <span class="comment"> * attr INTEGER,</span>
<a name="l00347"></a>00347 <span class="comment"> * value INTEGER,</span>
<a name="l00348"></a>00348 <span class="comment"> * cnt INTEGER,</span>
<a name="l00349"></a>00349 <span class="comment"> * attr_cnt INTEGER</span>
<a name="l00350"></a>00350 <span class="comment"> *)&lt;/pre&gt;</span>
<a name="l00351"></a>00351 <span class="comment"> *</span>
<a name="l00352"></a>00352 <span class="comment"> * Class priors are stored in a table of format</span>
<a name="l00353"></a>00353 <span class="comment"> * &lt;pre&gt;TABLE &lt;em&gt;classPriorsDestName&lt;/em&gt; (</span>
<a name="l00354"></a>00354 <span class="comment"> * class INTEGER,</span>
<a name="l00355"></a>00355 <span class="comment"> * class_cnt INTEGER,</span>
<a name="l00356"></a>00356 <span class="comment"> * all_cnt INTEGER</span>
<a name="l00357"></a>00357 <span class="comment"> *)&lt;/pre&gt;</span>
<a name="l00358"></a>00358 <span class="comment"> *</span>
<a name="l00359"></a>00359 <span class="comment"> * @param trainingSource Name of relation containing the training data</span>
<a name="l00360"></a>00360 <span class="comment"> * @param trainingClassColumn Name of class column in training data</span>
<a name="l00361"></a>00361 <span class="comment"> * @param trainingAttrColumn Name of attributes-array column in training data</span>
<a name="l00362"></a>00362 <span class="comment"> * @param numAttrs Number of attributes to use for classification</span>
<a name="l00363"></a>00363 <span class="comment"> * @param featureProbsDestName Name of feature-probabilities table to create</span>
<a name="l00364"></a>00364 <span class="comment"> * @param classPriorsDestName Name of class-priors table to create</span>
<a name="l00365"></a>00365 <span class="comment"> *</span>
<a name="l00366"></a>00366 <span class="comment"> * @usage</span>
<a name="l00367"></a>00367 <span class="comment"> * Precompute feature probabilities and class priors:</span>
<a name="l00368"></a>00368 <span class="comment"> * &lt;pre&gt;SELECT \ref create_nb_prepared_data_tables(</span>
<a name="l00369"></a>00369 <span class="comment"> * &#39;&lt;em&gt;trainingSource&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingClassColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;trainingAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00370"></a>00370 <span class="comment"> * &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;</span>
<a name="l00371"></a>00371 <span class="comment"> *);&lt;/pre&gt;</span>
<a name="l00372"></a>00372 <span class="comment"> *</span>
<a name="l00373"></a>00373 <span class="comment"> * @internal</span>
<a name="l00374"></a>00374 <span class="comment"> * @sa This function is a wrapper for bayes::create_prepared_data().</span>
<a name="l00375"></a>00375 <span class="comment"> */</span>
<a name="l00376"></a>00376 CREATE FUNCTION MADLIB_SCHEMA.create_nb_prepared_data_tables(
<a name="l00377"></a>00377 &quot;trainingSource&quot; VARCHAR,
<a name="l00378"></a>00378 &quot;trainingClassColumn&quot; VARCHAR,
<a name="l00379"></a>00379 &quot;trainingAttrColumn&quot; VARCHAR,
<a name="l00380"></a>00380 &quot;numAttrs&quot; INTEGER,
<a name="l00381"></a>00381 &quot;featureProbsDestName&quot; VARCHAR,
<a name="l00382"></a>00382 &quot;classPriorsDestName&quot; VARCHAR)
<a name="l00383"></a>00383 RETURNS VOID
<a name="l00384"></a>00384 AS $$PythonFunction(bayes, bayes, create_prepared_data_table)$$
<a name="l00385"></a>00385 LANGUAGE plpythonu VOLATILE;
<a name="l00386"></a>00386 <span class="comment"></span>
<a name="l00387"></a>00387 <span class="comment">/**</span>
<a name="l00388"></a>00388 <span class="comment"> * @brief Create a view with columns &lt;tt&gt;(key, nb_classification)&lt;/tt&gt;</span>
<a name="l00389"></a>00389 <span class="comment"> *</span>
<a name="l00390"></a>00390 <span class="comment"> * The created relation will be</span>
<a name="l00391"></a>00391 <span class="comment"> *</span>
<a name="l00392"></a>00392 <span class="comment"> * &lt;tt&gt;{TABLE|VIEW} &lt;em&gt;destName&lt;/em&gt; (key, nb_classification)&lt;/tt&gt;</span>
<a name="l00393"></a>00393 <span class="comment"> *</span>
<a name="l00394"></a>00394 <span class="comment"> * where \c nb_classification is an array containing the most likely</span>
<a name="l00395"></a>00395 <span class="comment"> * class(es) of the record in \em classifySource identified by \c key.</span>
<a name="l00396"></a>00396 <span class="comment"> *</span>
<a name="l00397"></a>00397 <span class="comment"> * @param featureProbsSource Name of table with precomputed feature</span>
<a name="l00398"></a><a class="code" href="bayes_8sql__in.html#aeb4eae7843dd789cc38d5fc57f4ccfb2">00398</a> <span class="comment"> * probabilities, as created with create_nb_prepared_data_tables()</span>
<a name="l00399"></a>00399 <span class="comment"> * @param classPriorsSource Name of table with precomputed class priors, as</span>
<a name="l00400"></a>00400 <span class="comment"> * created with create_nb_prepared_data_tables()</span>
<a name="l00401"></a>00401 <span class="comment"> * @param classifySource Name of the relation that contains data to be classified</span>
<a name="l00402"></a>00402 <span class="comment"> * @param classifyKeyColumn Name of column in \em classifySource that can</span>
<a name="l00403"></a>00403 <span class="comment"> * serve as unique identifier (the key of the source relation)</span>
<a name="l00404"></a>00404 <span class="comment"> * @param classifyAttrColumn Name of attributes-array column in \em classifySource</span>
<a name="l00405"></a>00405 <span class="comment"> * @param numAttrs Number of attributes to use for classification</span>
<a name="l00406"></a>00406 <span class="comment"> * @param destName Name of the view to create</span>
<a name="l00407"></a>00407 <span class="comment"> *</span>
<a name="l00408"></a>00408 <span class="comment"> * @note \c create_nb_classify_view can be called in an ad-hoc fashion. See</span>
<a name="l00409"></a>00409 <span class="comment"> * \ref grp_bayes for instructions.</span>
<a name="l00410"></a>00410 <span class="comment"> *</span>
<a name="l00411"></a>00411 <span class="comment"> * @usage</span>
<a name="l00412"></a>00412 <span class="comment"> * -# Create Naive Bayes classifications view:</span>
<a name="l00413"></a>00413 <span class="comment"> * &lt;pre&gt;SELECT \ref create_nb_classify_view(</span>
<a name="l00414"></a>00414 <span class="comment"> * &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;,</span>
<a name="l00415"></a>00415 <span class="comment"> * &#39;&lt;em&gt;classifySource&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyKeyColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00416"></a>00416 <span class="comment"> * &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;destName&lt;/em&gt;&#39;</span>
<a name="l00417"></a>00417 <span class="comment"> *);&lt;/pre&gt;</span>
<a name="l00418"></a>00418 <span class="comment"> * -# Show Naive Bayes classifications:</span>
<a name="l00419"></a>00419 <span class="comment"> * &lt;pre&gt;SELECT * FROM &lt;em&gt;destName&lt;/em&gt;;&lt;/pre&gt;</span>
<a name="l00420"></a>00420 <span class="comment"> *</span>
<a name="l00421"></a>00421 <span class="comment"> * @internal</span>
<a name="l00422"></a>00422 <span class="comment"> * @sa This function is a wrapper for bayes::create_classification(). See there</span>
<a name="l00423"></a>00423 <span class="comment"> * for details.</span>
<a name="l00424"></a>00424 <span class="comment"> */</span>
<a name="l00425"></a>00425 CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
<a name="l00426"></a>00426 &quot;featureProbsSource&quot; VARCHAR,
<a name="l00427"></a>00427 &quot;classPriorsSource&quot; VARCHAR,
<a name="l00428"></a>00428 &quot;classifySource&quot; VARCHAR,
<a name="l00429"></a>00429 &quot;classifyKeyColumn&quot; VARCHAR,
<a name="l00430"></a>00430 &quot;classifyAttrColumn&quot; VARCHAR,
<a name="l00431"></a>00431 &quot;numAttrs&quot; INTEGER,
<a name="l00432"></a>00432 &quot;destName&quot; VARCHAR)
<a name="l00433"></a>00433 RETURNS VOID
<a name="l00434"></a>00434 AS $$PythonFunction(bayes, bayes, create_classification_view)$$
<a name="l00435"></a>00435 LANGUAGE plpythonu VOLATILE;
<a name="l00436"></a>00436
<a name="l00437"></a>00437 CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
<a name="l00438"></a>00438 &quot;trainingSource&quot; VARCHAR,
<a name="l00439"></a>00439 &quot;trainingClassColumn&quot; VARCHAR,
<a name="l00440"></a>00440 &quot;trainingAttrColumn&quot; VARCHAR,
<a name="l00441"></a>00441 &quot;classifySource&quot; VARCHAR,
<a name="l00442"></a>00442 &quot;classifyKeyColumn&quot; VARCHAR,
<a name="l00443"></a>00443 &quot;classifyAttrColumn&quot; VARCHAR,
<a name="l00444"></a>00444 &quot;numAttrs&quot; INTEGER,
<a name="l00445"></a>00445 &quot;destName&quot; VARCHAR)
<a name="l00446"></a>00446 RETURNS VOID
<a name="l00447"></a><a class="code" href="bayes_8sql__in.html#a798402280fc6db710957ae3ab58767e0">00447</a> AS $$PythonFunction(bayes, bayes, create_classification_view)$$
<a name="l00448"></a>00448 LANGUAGE plpythonu VOLATILE;
<a name="l00449"></a>00449
<a name="l00450"></a>00450 <span class="comment"></span>
<a name="l00451"></a>00451 <span class="comment">/**</span>
<a name="l00452"></a>00452 <span class="comment"> * @brief Create view with columns &lt;tt&gt;(key, class, nb_prob)&lt;/tt&gt;</span>
<a name="l00453"></a>00453 <span class="comment"> *</span>
<a name="l00454"></a>00454 <span class="comment"> * The created view will be of the following form:</span>
<a name="l00455"></a>00455 <span class="comment"> *</span>
<a name="l00456"></a>00456 <span class="comment"> * &lt;pre&gt;VIEW &lt;em&gt;destName&lt;/em&gt; (</span>
<a name="l00457"></a>00457 <span class="comment"> * key ANYTYPE,</span>
<a name="l00458"></a>00458 <span class="comment"> * class INTEGER,</span>
<a name="l00459"></a>00459 <span class="comment"> * nb_prob FLOAT8</span>
<a name="l00460"></a>00460 <span class="comment"> *)&lt;/pre&gt;</span>
<a name="l00461"></a>00461 <span class="comment"> *</span>
<a name="l00462"></a>00462 <span class="comment"> * where \c nb_prob is the Naive-Bayes probability that \c class is the true</span>
<a name="l00463"></a>00463 <span class="comment"> * class of the record in \em classifySource identified by \c key.</span>
<a name="l00464"></a>00464 <span class="comment"> *</span>
<a name="l00465"></a>00465 <span class="comment"> * @param featureProbsSource Name of table with precomputed feature</span>
<a name="l00466"></a>00466 <span class="comment"> * probabilities, as created with create_nb_prepared_data_tables()</span>
<a name="l00467"></a>00467 <span class="comment"> * @param classPriorsSource Name of table with precomputed class priors, as</span>
<a name="l00468"></a>00468 <span class="comment"> * created with create_nb_prepared_data_tables()</span>
<a name="l00469"></a>00469 <span class="comment"> * @param classifySource Name of the relation that contains data to be classified</span>
<a name="l00470"></a>00470 <span class="comment"> * @param classifyKeyColumn Name of column in \em classifySource that can</span>
<a name="l00471"></a>00471 <span class="comment"> * serve as unique identifier (the key of the source relation)</span>
<a name="l00472"></a>00472 <span class="comment"> * @param classifyAttrColumn Name of attributes-array column in \em classifySource</span>
<a name="l00473"></a>00473 <span class="comment"> * @param numAttrs Number of attributes to use for classification</span>
<a name="l00474"></a>00474 <span class="comment"> * @param destName Name of the view to create</span>
<a name="l00475"></a>00475 <span class="comment"> *</span>
<a name="l00476"></a>00476 <span class="comment"> * @note \c create_nb_probs_view can be called in an ad-hoc fashion. See</span>
<a name="l00477"></a>00477 <span class="comment"> * \ref grp_bayes for instructions.</span>
<a name="l00478"></a>00478 <span class="comment"> *</span>
<a name="l00479"></a>00479 <span class="comment"> * @usage</span>
<a name="l00480"></a>00480 <span class="comment"> * -# Create Naive Bayes probabilities view:</span>
<a name="l00481"></a>00481 <span class="comment"> * &lt;pre&gt;SELECT \ref create_nb_probs_view(</span>
<a name="l00482"></a>00482 <span class="comment"> * &#39;&lt;em&gt;featureProbsName&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsName&lt;/em&gt;&#39;,</span>
<a name="l00483"></a>00483 <span class="comment"> * &#39;&lt;em&gt;classifySource&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyKeyColumn&lt;/em&gt;&#39;, &#39;&lt;em&gt;classifyAttrColumn&lt;/em&gt;&#39;,</span>
<a name="l00484"></a>00484 <span class="comment"> * &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;destName&lt;/em&gt;&#39;</span>
<a name="l00485"></a>00485 <span class="comment"> *);&lt;/pre&gt;</span>
<a name="l00486"></a>00486 <span class="comment"> * -# Show Naive Bayes probabilities:</span>
<a name="l00487"></a>00487 <span class="comment"> * &lt;pre&gt;SELECT * FROM &lt;em&gt;destName&lt;/em&gt;;&lt;/pre&gt;</span>
<a name="l00488"></a>00488 <span class="comment"> *</span>
<a name="l00489"></a>00489 <span class="comment"> * @internal</span>
<a name="l00490"></a>00490 <span class="comment"> * @sa This function is a wrapper for bayes::create_bayes_probabilities().</span>
<a name="l00491"></a>00491 <span class="comment"> */</span>
<a name="l00492"></a>00492 CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
<a name="l00493"></a>00493 &quot;featureProbsSource&quot; VARCHAR,
<a name="l00494"></a>00494 &quot;classPriorsSource&quot; VARCHAR,
<a name="l00495"></a>00495 &quot;classifySource&quot; VARCHAR,
<a name="l00496"></a>00496 &quot;classifyKeyColumn&quot; VARCHAR,
<a name="l00497"></a>00497 &quot;classifyAttrColumn&quot; VARCHAR,
<a name="l00498"></a>00498 &quot;numAttrs&quot; INTEGER,
<a name="l00499"></a>00499 &quot;destName&quot; VARCHAR)
<a name="l00500"></a>00500 RETURNS VOID
<a name="l00501"></a>00501 AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$
<a name="l00502"></a>00502 LANGUAGE plpythonu VOLATILE;
<a name="l00503"></a>00503
<a name="l00504"></a>00504 CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
<a name="l00505"></a>00505 &quot;trainingSource&quot; VARCHAR,
<a name="l00506"></a>00506 &quot;trainingClassColumn&quot; VARCHAR,
<a name="l00507"></a>00507 &quot;trainingAttrColumn&quot; VARCHAR,
<a name="l00508"></a>00508 &quot;classifySource&quot; VARCHAR,
<a name="l00509"></a>00509 &quot;classifyKeyColumn&quot; VARCHAR,
<a name="l00510"></a>00510 &quot;classifyAttrColumn&quot; VARCHAR,
<a name="l00511"></a>00511 &quot;numAttrs&quot; INTEGER,
<a name="l00512"></a>00512 &quot;destName&quot; VARCHAR)
<a name="l00513"></a>00513 RETURNS VOID
<a name="l00514"></a><a class="code" href="bayes_8sql__in.html#a163afffd0c845d325f060f74bcf02243">00514</a> AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$
<a name="l00515"></a>00515 LANGUAGE plpythonu VOLATILE;
<a name="l00516"></a>00516
<a name="l00517"></a>00517 <span class="comment"></span>
<a name="l00518"></a>00518 <span class="comment">/**</span>
<a name="l00519"></a>00519 <span class="comment"> * @brief Create a SQL function mapping arrays of attribute values to the Naive</span>
<a name="l00520"></a>00520 <span class="comment"> * Bayes classification.</span>
<a name="l00521"></a>00521 <span class="comment"> *</span>
<a name="l00522"></a>00522 <span class="comment"> * The created SQL function is bound to the given feature probabilities and</span>
<a name="l00523"></a>00523 <span class="comment"> * class priors. Its declaration will be:</span>
<a name="l00524"></a>00524 <span class="comment"> *</span>
<a name="l00525"></a>00525 <span class="comment"> * &lt;tt&gt;</span>
<a name="l00526"></a>00526 <span class="comment"> * FUNCTION &lt;em&gt;destName&lt;/em&gt; (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)</span>
<a name="l00527"></a>00527 <span class="comment"> * RETURNS INTEGER[]&lt;/tt&gt;</span>
<a name="l00528"></a>00528 <span class="comment"> *</span>
<a name="l00529"></a>00529 <span class="comment"> * The return type is \c INTEGER[] because the Naive Bayes classification might</span>
<a name="l00530"></a>00530 <span class="comment"> * be ambiguous (in which case all of the most likely candidates are returned).</span>
<a name="l00531"></a>00531 <span class="comment"> *</span>
<a name="l00532"></a>00532 <span class="comment"> * @param featureProbsSource Name of table with precomputed feature</span>
<a name="l00533"></a>00533 <span class="comment"> * probabilities, as created with create_nb_prepared_data_tables()</span>
<a name="l00534"></a>00534 <span class="comment"> * @param classPriorsSource Name of table with precomputed class priors, as</span>
<a name="l00535"></a>00535 <span class="comment"> * created with create_nb_prepared_data_tables()</span>
<a name="l00536"></a>00536 <span class="comment"> * @param numAttrs Number of attributes to use for classification</span>
<a name="l00537"></a>00537 <span class="comment"> * @param destName Name of the function to create</span>
<a name="l00538"></a>00538 <span class="comment"> *</span>
<a name="l00539"></a>00539 <span class="comment"> * @note</span>
<a name="l00540"></a>00540 <span class="comment"> * Just like \ref create_nb_classify_view and \ref create_nb_probs_view,</span>
<a name="l00541"></a>00541 <span class="comment"> * also \c create_nb_classify_fn can be called in an ad-hoc fashion. See</span>
<a name="l00542"></a>00542 <span class="comment"> * \ref grp_bayes for instructions.</span>
<a name="l00543"></a>00543 <span class="comment"> *</span>
<a name="l00544"></a>00544 <span class="comment"> * @usage</span>
<a name="l00545"></a>00545 <span class="comment"> * -# Create classification function:</span>
<a name="l00546"></a>00546 <span class="comment"> * &lt;pre&gt;SELECT create_nb_classify_fn(</span>
<a name="l00547"></a>00547 <span class="comment"> * &#39;&lt;em&gt;featureProbsSource&lt;/em&gt;&#39;, &#39;&lt;em&gt;classPriorsSource&lt;/em&gt;&#39;,</span>
<a name="l00548"></a>00548 <span class="comment"> * &lt;em&gt;numAttrs&lt;/em&gt;, &#39;&lt;em&gt;destName&lt;/em&gt;&#39;</span>
<a name="l00549"></a>00549 <span class="comment"> *);&lt;/pre&gt;</span>
<a name="l00550"></a>00550 <span class="comment"> * -# Run classification function:</span>
<a name="l00551"></a>00551 <span class="comment"> * &lt;pre&gt;SELECT &lt;em&gt;destName&lt;/em&gt;(&lt;em&gt;attributes&lt;/em&gt;, &lt;em&gt;smoothingFactor&lt;/em&gt;);&lt;/pre&gt;</span>
<a name="l00552"></a>00552 <span class="comment"> *</span>
<a name="l00553"></a>00553 <span class="comment"> * @note</span>
<a name="l00554"></a>00554 <span class="comment"> * On Greenplum, the generated SQL function can only be called on the master.</span>
<a name="l00555"></a>00555 <span class="comment"> *</span>
<a name="l00556"></a>00556 <span class="comment"> * @internal</span>
<a name="l00557"></a>00557 <span class="comment"> * @sa This function is a wrapper for bayes::create_classification_function().</span>
<a name="l00558"></a>00558 <span class="comment"> */</span>
<a name="l00559"></a>00559 CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
<a name="l00560"></a>00560 &quot;featureProbsSource&quot; VARCHAR,
<a name="l00561"></a>00561 &quot;classPriorsSource&quot; VARCHAR,
<a name="l00562"></a>00562 &quot;numAttrs&quot; INTEGER,
<a name="l00563"></a>00563 &quot;destName&quot; VARCHAR)
<a name="l00564"></a>00564 RETURNS VOID
<a name="l00565"></a>00565 AS $$PythonFunction(bayes, bayes, create_classification_function)$$
<a name="l00566"></a>00566 LANGUAGE plpythonu VOLATILE;
<a name="l00567"></a>00567
<a name="l00568"></a>00568 CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
<a name="l00569"></a>00569 &quot;trainingSource&quot; VARCHAR,
<a name="l00570"></a>00570 &quot;trainingClassColumn&quot; VARCHAR,
<a name="l00571"></a>00571 &quot;trainingAttrColumn&quot; VARCHAR,
<a name="l00572"></a>00572 &quot;numAttrs&quot; INTEGER,
<a name="l00573"></a>00573 &quot;destName&quot; VARCHAR)
<a name="l00574"></a>00574 RETURNS VOID
<a name="l00575"></a>00575 AS $$PythonFunction(bayes, bayes, create_classification_function)$$
<a name="l00576"></a>00576 LANGUAGE plpythonu VOLATILE;
</pre></div></div>
</div>
<div id="nav-path" class="navpath">
<ul>
<li class="navelem"><a class="el" href="bayes_8sql__in.html">bayes.sql_in</a> </li>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<li class="footer">Generated on Tue Apr 2 2013 14:57:03 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
</ul>
</div>
</body>
</html>