| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <title>MADlib: kmeans.sql_in Source File</title> |
| |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { searchBox.OnSelectItem(0); }); |
| </script> |
| <script src="../mathjax/MathJax.js"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div! --> |
| |
| |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| |
| |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname">MADlib |
|  <span id="projectnumber">0.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./kmeans_8sql__in_source.html"> A newer version is available</a></span> |
| </div> |
| <div id="projectbrief">User Documentation</div> |
| </td> |
| |
| |
| |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| |
| <!-- Generated by Doxygen 1.7.5.1 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <div id="navrow1" class="tabs"> |
| <ul class="tablist"> |
| <li><a href="index.html"><span>Main Page</span></a></li> |
| <li><a href="modules.html"><span>Modules</span></a></li> |
| <li class="current"><a href="files.html"><span>Files</span></a></li> |
| <li> |
| <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </li> |
| </ul> |
| </div> |
| <div id="navrow2" class="tabs2"> |
| <ul class="tablist"> |
| <li><a href="files.html"><span>File List</span></a></li> |
| <li><a href="globals.html"><span>File Members</span></a></li> |
| </ul> |
| </div> |
| </div> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| initNavTree('kmeans_8sql__in.html',''); |
| </script> |
| <div id="doc-content"> |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">kmeans.sql_in</div> </div> |
| </div> |
| <div class="contents"> |
| <a href="kmeans_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/**</span> |
| <a name="l00002"></a>00002 <span class="comment"> *</span> |
| <a name="l00003"></a>00003 <span class="comment"> * @file kmeans.sql_in</span> |
| <a name="l00004"></a>00004 <span class="comment"> *</span> |
| <a name="l00005"></a>00005 <span class="comment"> * @brief Set of functions for k-means clustering.</span> |
| <a name="l00006"></a>00006 <span class="comment"> *</span> |
| <a name="l00007"></a>00007 <span class="comment"> * @sa For a brief introduction to k-means clustering, see the module</span> |
| <a name="l00008"></a>00008 <span class="comment"> * description \ref grp_kmeans.</span> |
| <a name="l00009"></a>00009 <span class="comment"> *</span> |
| <a name="l00010"></a>00010 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span> |
| <a name="l00011"></a>00011 |
| <a name="l00012"></a>00012 m4_include(`SQLCommon.m4<span class="stringliteral">')</span> |
| <a name="l00013"></a>00013 <span class="stringliteral"></span><span class="comment"></span> |
| <a name="l00014"></a>00014 <span class="comment">/**</span> |
| <a name="l00015"></a>00015 <span class="comment">@addtogroup grp_kmeans</span> |
| <a name="l00016"></a>00016 <span class="comment"></span> |
| <a name="l00017"></a>00017 <span class="comment">@about</span> |
| <a name="l00018"></a>00018 <span class="comment"></span> |
| <a name="l00019"></a>00019 <span class="comment">Clustering refers to the problem of partitioning a set of objects according to</span> |
| <a name="l00020"></a>00020 <span class="comment">some problem-dependent measure of <em>similarity</em>. In the k-means variant,</span> |
| <a name="l00021"></a>00021 <span class="comment">one is given \f$ n \f$ points \f$ x_1, \dots, x_n \in \mathbb R^d \f$, and the</span> |
| <a name="l00022"></a>00022 <span class="comment">goal is to position \f$ k \f$ centroids \f$ c_1, \dots, c_k \in \mathbb R^d \f$</span> |
| <a name="l00023"></a>00023 <span class="comment">so that the sum of \em distances between each point and its closest centroid is</span> |
| <a name="l00024"></a>00024 <span class="comment">minimized. Each centroid represents a cluster that consists of all points to</span> |
| <a name="l00025"></a>00025 <span class="comment">which this centroid is closest. Formally, we wish to minimize the following</span> |
| <a name="l00026"></a>00026 <span class="comment">objective function:</span> |
| <a name="l00027"></a>00027 <span class="comment">\f[</span> |
| <a name="l00028"></a>00028 <span class="comment"> (c_1, \dots, c_k) \mapsto \sum_{i=1}^n \min_{j=1}^k \operatorname{dist}(x_i, c_j)</span> |
| <a name="l00029"></a>00029 <span class="comment">\f]</span> |
| <a name="l00030"></a>00030 <span class="comment">In the most common case, \f$ \operatorname{dist} \f$ is the square of the</span> |
| <a name="l00031"></a>00031 <span class="comment">Euclidean distance.</span> |
| <a name="l00032"></a>00032 <span class="comment"></span> |
| <a name="l00033"></a>00033 <span class="comment">This problem is computationally difficult (NP-hard), yet the</span> |
| <a name="l00034"></a>00034 <span class="comment">local-search heuristic proposed by Lloyd [4] performs reasonably well in</span> |
| <a name="l00035"></a>00035 <span class="comment">practice. In fact, it is so ubiquitous today that it is</span> |
| <a name="l00036"></a>00036 <span class="comment">often referred to as the <em>standard algorithm</em> or even just the</span> |
| <a name="l00037"></a>00037 <span class="comment"><em>k-means algorithm</em> [1]. It works as follows:</span> |
| <a name="l00038"></a>00038 <span class="comment"></span> |
| <a name="l00039"></a>00039 <span class="comment">-# Seed the \f$ k \f$ centroids (see below)</span> |
| <a name="l00040"></a>00040 <span class="comment">-# Repeat until convergence:</span> |
| <a name="l00041"></a>00041 <span class="comment"> -# Assign each point to its closest centroid</span> |
| <a name="l00042"></a>00042 <span class="comment"> -# Move each centroid to a position that minimizes the sum of distances in this</span> |
| <a name="l00043"></a>00043 <span class="comment"> cluster</span> |
| <a name="l00044"></a>00044 <span class="comment">-# Convergence is achieved when no points change their assignments during step</span> |
| <a name="l00045"></a>00045 <span class="comment"> 2a.</span> |
| <a name="l00046"></a>00046 <span class="comment"></span> |
| <a name="l00047"></a>00047 <span class="comment">Since the objective function decreases in every step, this algorithm is</span> |
| <a name="l00048"></a>00048 <span class="comment">guaranteed to converge to a local optimum.</span> |
| <a name="l00049"></a>00049 <span class="comment"></span> |
| <a name="l00050"></a>00050 <span class="comment">@implementation</span> |
| <a name="l00051"></a>00051 <span class="comment"></span> |
| <a name="l00052"></a>00052 <span class="comment">Data points and predefined centroids (if used) are expected to be stored row-wise,</span> |
| <a name="l00053"></a>00053 <span class="comment">in a column of type <tt>\ref grp_svec "SVEC"</tt> (or any type convertible to</span> |
| <a name="l00054"></a>00054 <span class="comment"><tt>\ref grp_svec "SVEC"</tt>, like <tt>FLOAT[]</tt> or <tt>INTEGER[]</tt>).</span> |
| <a name="l00055"></a>00055 <span class="comment">Data points with non-finite values (NULL, NaN, infinity) in any component</span> |
| <a name="l00056"></a>00056 <span class="comment">will be skipped during analysis.</span> |
| <a name="l00057"></a>00057 <span class="comment"></span> |
| <a name="l00058"></a>00058 <span class="comment">The following methods are available for the centroid seeding:</span> |
| <a name="l00059"></a>00059 <span class="comment"> - <strong>random selection</strong>:</span> |
| <a name="l00060"></a>00060 <span class="comment"> Select \f$ k \f$ centroids randomly among the input points.</span> |
| <a name="l00061"></a>00061 <span class="comment"> - <strong>kmeans++</strong> [2]:</span> |
| <a name="l00062"></a>00062 <span class="comment"> Start with a single centroid chosen randomly among the input points. Then</span> |
| <a name="l00063"></a>00063 <span class="comment"> iteratively choose new</span> |
| <a name="l00064"></a>00064 <span class="comment"> centroids from the input points until there is a total of \f$ k \f$</span> |
| <a name="l00065"></a>00065 <span class="comment"> centroids. The probability for picking a particular point is proportional to</span> |
| <a name="l00066"></a>00066 <span class="comment"> its minimum distance to any existing centroid.</span> |
| <a name="l00067"></a>00067 <span class="comment"> \n</span> |
| <a name="l00068"></a>00068 <span class="comment"> Intuitively, kmeans++ favors seedings where centroids are spread out over the</span> |
| <a name="l00069"></a>00069 <span class="comment"> whole range of the input points, while at the same time not being too</span> |
| <a name="l00070"></a>00070 <span class="comment"> susceptible to outliers [2].</span> |
| <a name="l00071"></a>00071 <span class="comment"> - <strong>user-specified set of initial centroids</strong>:</span> |
| <a name="l00072"></a>00072 <span class="comment"> See below for a description of the expected format of the set of initial</span> |
| <a name="l00073"></a>00073 <span class="comment"> centroids.</span> |
| <a name="l00074"></a>00074 <span class="comment"></span> |
| <a name="l00075"></a>00075 <span class="comment">The following distance functions can be used (computation of barycenter/mean in parentheses):</span> |
| <a name="l00076"></a>00076 <span class="comment"> - <strong>\ref dist_norm1</strong>: 1-norm/Manhattan (element-wise median [Note that MADlib does not provide a median</span> |
| <a name="l00077"></a>00077 <span class="comment"> aggregate function for support and performance reasons.])</span> |
| <a name="l00078"></a>00078 <span class="comment"> - <strong>\ref dist_norm2</strong>: 2-norm/Euclidean (element-wise mean)</span> |
| <a name="l00079"></a>00079 <span class="comment"> - <strong>\ref squared_dist_norm2</strong>: squared Euclidean distance (element-wise mean)</span> |
| <a name="l00080"></a>00080 <span class="comment"> - <strong>\ref dist_angle</strong>: angle (element-wise mean of normalized points)</span> |
| <a name="l00081"></a>00081 <span class="comment"> - <strong>\ref dist_tanimoto</strong>: tanimoto (element-wise mean of normalized points [5])</span> |
| <a name="l00082"></a>00082 <span class="comment"> - <strong>user defined function</strong> with signature DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</span> |
| <a name="l00083"></a>00083 <span class="comment"></span> |
| <a name="l00084"></a>00084 <span class="comment">The following aggregate functions for determining centroids can be used:</span> |
| <a name="l00085"></a>00085 <span class="comment"> - <strong>\ref avg</strong>: average</span> |
| <a name="l00086"></a>00086 <span class="comment"> - <strong>\ref normalized_avg</strong>: normalized average</span> |
| <a name="l00087"></a>00087 <span class="comment"></span> |
| <a name="l00088"></a>00088 <span class="comment">The algorithm stops when one of the following conditions is met:</span> |
| <a name="l00089"></a>00089 <span class="comment"> - The fraction of updated points is smaller than the convergence threshold (default: 0.001).</span> |
| <a name="l00090"></a>00090 <span class="comment"> - The algorithm reaches the maximum number of allowed iterations (default: 20).</span> |
| <a name="l00091"></a>00091 <span class="comment"></span> |
| <a name="l00092"></a>00092 <span class="comment">A popular method to assess the quality of the clustering is the</span> |
| <a name="l00093"></a>00093 <span class="comment"><em>silhouette coefficient</em>, a simplified version of which is provided as part of the k-means module.</span> |
| <a name="l00094"></a>00094 <span class="comment">Note that for large data sets, this computation is expensive.</span> |
| <a name="l00095"></a>00095 <span class="comment"></span> |
| <a name="l00096"></a>00096 <span class="comment">@input</span> |
| <a name="l00097"></a>00097 <span class="comment">The <strong>source relation</strong> is expected to be of the following form (or</span> |
| <a name="l00098"></a>00098 <span class="comment">to be implicitly convertible into the following form):</span> |
| <a name="l00099"></a>00099 <span class="comment"><pre>{TABLE|VIEW} <em>rel_source</em> (</span> |
| <a name="l00100"></a>00100 <span class="comment"> ...</span> |
| <a name="l00101"></a>00101 <span class="comment"> <em>expr_points</em> FLOAT8[],</span> |
| <a name="l00102"></a>00102 <span class="comment"> ...</span> |
| <a name="l00103"></a>00103 <span class="comment">)</pre></span> |
| <a name="l00104"></a>00104 <span class="comment">where:</span> |
| <a name="l00105"></a>00105 <span class="comment"> - <em>expr_points</em> is the name of a column with point coordinates.</span> |
| <a name="l00106"></a>00106 <span class="comment"> Types such as \c svec or <tt>INTEGER[]</tt> are implicitly converted to</span> |
| <a name="l00107"></a>00107 <span class="comment"> <tt>FLOAT8[]</tt>.</span> |
| <a name="l00108"></a>00108 <span class="comment"></span> |
| <a name="l00109"></a>00109 <span class="comment">If kmeans is called with a set of initial centroids, the centroid relation is</span> |
| <a name="l00110"></a>00110 <span class="comment">expected to be of the following form:</span> |
| <a name="l00111"></a>00111 <span class="comment"><pre>{TABLE|VIEW} <em>rel_initial_centroids</em> (</span> |
| <a name="l00112"></a>00112 <span class="comment"> ...</span> |
| <a name="l00113"></a>00113 <span class="comment"> <em>expr_centroid</em> DOUBLE PRECISION[],</span> |
| <a name="l00114"></a>00114 <span class="comment"> ...</span> |
| <a name="l00115"></a>00115 <span class="comment">)</pre></span> |
| <a name="l00116"></a>00116 <span class="comment">where:</span> |
| <a name="l00117"></a>00117 <span class="comment"> - <em>expr_centroid</em> is the name of a column with coordinates.</span> |
| <a name="l00118"></a>00118 <span class="comment"></span> |
| <a name="l00119"></a>00119 <span class="comment">@usage</span> |
| <a name="l00120"></a>00120 <span class="comment">The k-means algorithm can be invoked in four possible ways:</span> |
| <a name="l00121"></a>00121 <span class="comment"></span> |
| <a name="l00122"></a>00122 <span class="comment">- using <em>random</em> centroid seeding method for a</span> |
| <a name="l00123"></a>00123 <span class="comment">provided \f$ k \f$:</span> |
| <a name="l00124"></a>00124 <span class="comment"><pre>SELECT * FROM \ref kmeans_random(</span> |
| <a name="l00125"></a>00125 <span class="comment"> '<em>rel_source</em>', '<em>expr_point</em>', k,</span> |
| <a name="l00126"></a>00126 <span class="comment"> [ '<em>fn_dist</em>', '<em>agg_centroid</em>',</span> |
| <a name="l00127"></a>00127 <span class="comment"> <em>max_num_iterations</em>, <em>min_frac_reassigned</em> ]</span> |
| <a name="l00128"></a>00128 <span class="comment">);</pre></span> |
| <a name="l00129"></a>00129 <span class="comment"></span> |
| <a name="l00130"></a>00130 <span class="comment">- using <em>kmeans++</em> centroid seeding method for a</span> |
| <a name="l00131"></a>00131 <span class="comment">provided \f$ k \f$:</span> |
| <a name="l00132"></a>00132 <span class="comment"><pre>SELECT * FROM \ref kmeanspp(</span> |
| <a name="l00133"></a>00133 <span class="comment"> '<em>rel_source</em>', '<em>expr_point</em>', k,</span> |
| <a name="l00134"></a>00134 <span class="comment"> [ '<em>fn_dist</em>', '<em>agg_centroid</em>',</span> |
| <a name="l00135"></a>00135 <span class="comment"> <em>max_num_iterations</em>, <em>min_frac_reassigned</em> ]</span> |
| <a name="l00136"></a>00136 <span class="comment">);</pre></span> |
| <a name="l00137"></a>00137 <span class="comment"></span> |
| <a name="l00138"></a>00138 <span class="comment">- with a provided centroid set:</span> |
| <a name="l00139"></a>00139 <span class="comment"><pre>SELECT * FROM \ref kmeans(</span> |
| <a name="l00140"></a>00140 <span class="comment"> '<em>rel_source</em>', '<em>expr_point</em>',</span> |
| <a name="l00141"></a>00141 <span class="comment"> '<em>rel_initial_centroids</em>', '<em>expr_centroid</em>',</span> |
| <a name="l00142"></a>00142 <span class="comment"> [ '<em>fn_dist</em>', '<em>agg_centroid</em>',</span> |
| <a name="l00143"></a>00143 <span class="comment"> <em>max_num_iterations</em>, <em>min_frac_reassigned</em> ]</span> |
| <a name="l00144"></a>00144 <span class="comment">);</pre></span> |
| <a name="l00145"></a>00145 <span class="comment">------------ OR ---------------</span> |
| <a name="l00146"></a>00146 <span class="comment"><pre>SELECT * FROM \ref kmeans(</span> |
| <a name="l00147"></a>00147 <span class="comment"> '<em>rel_source</em>', '<em>expr_point</em>',</span> |
| <a name="l00148"></a>00148 <span class="comment"> initial_centroids,</span> |
| <a name="l00149"></a>00149 <span class="comment"> [ '<em>fn_dist</em>', '<em>agg_centroid</em>',</span> |
| <a name="l00150"></a>00150 <span class="comment"> <em>max_num_iterations</em>, <em>min_frac_reassigned</em> ]</span> |
| <a name="l00151"></a>00151 <span class="comment">);</pre></span> |
| <a name="l00152"></a>00152 <span class="comment">where:</span> |
| <a name="l00153"></a>00153 <span class="comment"> - <em>initial_centroids</em> is of type <tt>DOUBLE PRECISION[][]</tt>.</span> |
| <a name="l00154"></a>00154 <span class="comment"></span> |
| <a name="l00155"></a>00155 <span class="comment">The output of the k-means module is a table that includes the final</span> |
| <a name="l00156"></a>00156 <span class="comment">centroid positions (DOUBLE PRECISION[][]), the objective function,</span> |
| <a name="l00157"></a>00157 <span class="comment">the fraction of reassigned points in the last iteration, and</span> |
| <a name="l00158"></a>00158 <span class="comment">the number of total iterations:</span> |
| <a name="l00159"></a>00159 <span class="comment"><pre></span> |
| <a name="l00160"></a>00160 <span class="comment"> centroids | objective_fn | frac_reassigned | num_iterations</span> |
| <a name="l00161"></a>00161 <span class="comment">----------------------------------+------------------+-----------------+----------------</span> |
| <a name="l00162"></a>00162 <span class="comment"> ...</span> |
| <a name="l00163"></a>00163 <span class="comment"></pre></span> |
| <a name="l00164"></a>00164 <span class="comment"></span> |
| <a name="l00165"></a>00165 <span class="comment">@examp</span> |
| <a name="l00166"></a>00166 <span class="comment"></span> |
| <a name="l00167"></a>00167 <span class="comment">-# Prepare some input data.</span> |
| <a name="l00168"></a>00168 <span class="comment">\code</span> |
| <a name="l00169"></a>00169 <span class="comment">sql> SELECT * FROM public.km_sample LIMIT 5;</span> |
| <a name="l00170"></a>00170 <span class="comment"> points</span> |
| <a name="l00171"></a>00171 <span class="comment">-------------------------------------------</span> |
| <a name="l00172"></a>00172 <span class="comment"> {1,1}:{15.8822241332382,105.945462542586}</span> |
| <a name="l00173"></a>00173 <span class="comment"> {1,1}:{34.5065216883086,72.3126099305227}</span> |
| <a name="l00174"></a>00174 <span class="comment"> {1,1}:{22.5074400822632,95.3209559689276}</span> |
| <a name="l00175"></a>00175 <span class="comment"> {1,1}:{70.2589857042767,68.7395178806037}</span> |
| <a name="l00176"></a>00176 <span class="comment"> {1,1}:{30.9844257542863,25.3213323024102}</span> |
| <a name="l00177"></a>00177 <span class="comment">(5 rows)</span> |
| <a name="l00178"></a>00178 <span class="comment">\endcode</span> |
| <a name="l00179"></a>00179 <span class="comment">Note: the example <em>points</em> is type <tt>\ref grp_svec "SVEC"</tt>.</span> |
| <a name="l00180"></a>00180 <span class="comment">-# Run k-means clustering using kmeans++ for centroid seeding:</span> |
| <a name="l00181"></a>00181 <span class="comment">\code</span> |
| <a name="l00182"></a>00182 <span class="comment">sql> SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2, 'madlib.squared_dist_norm2', 'madlib.avg', 20, 0.001);</span> |
| <a name="l00183"></a>00183 <span class="comment">);</span> |
| <a name="l00184"></a>00184 <span class="comment"> centroids | objective_fn | frac_reassigned | num_iterations</span> |
| <a name="l00185"></a>00185 <span class="comment">-------------------------------------------------------------------------+------------------+-----------------+----------------</span> |
| <a name="l00186"></a>00186 <span class="comment"> {{68.01668579784,48.9667382972952},{28.1452167573446,84.5992507653263}} | 586729.010675982 | 0.001 | 5</span> |
| <a name="l00187"></a>00187 <span class="comment">\endcode</span> |
| <a name="l00188"></a>00188 <span class="comment">-# Calculate the simplified silhouette coefficient:</span> |
| <a name="l00189"></a>00189 <span class="comment">\code</span> |
| <a name="l00190"></a>00190 <span class="comment">sql> SELECT * from madlib.simple_silhouette('km_test_svec','points',</span> |
| <a name="l00191"></a>00191 <span class="comment"> (select centroids from madlib.kmeanspp('km_test_svec','points',2,'madlib.squared_dist_norm2','madlib.avg',20,0.001)),</span> |
| <a name="l00192"></a>00192 <span class="comment"> 'madlib.dist_norm2');</span> |
| <a name="l00193"></a>00193 <span class="comment"> simple_silhouette</span> |
| <a name="l00194"></a>00194 <span class="comment">-------------------</span> |
| <a name="l00195"></a>00195 <span class="comment"> 0.611022970398174</span> |
| <a name="l00196"></a>00196 <span class="comment">\endcode</span> |
| <a name="l00197"></a>00197 <span class="comment"></span> |
| <a name="l00198"></a>00198 <span class="comment">@literature</span> |
| <a name="l00199"></a>00199 <span class="comment"></span> |
| <a name="l00200"></a>00200 <span class="comment">[1] Wikipedia, K-means Clustering,</span> |
| <a name="l00201"></a>00201 <span class="comment"> http://en.wikipedia.org/wiki/K-means_clustering</span> |
| <a name="l00202"></a>00202 <span class="comment"></span> |
| <a name="l00203"></a>00203 <span class="comment">[2] David Arthur, Sergei Vassilvitskii: k-means++: the advantages of careful</span> |
| <a name="l00204"></a>00204 <span class="comment"> seeding, Proceedings of the 18th Annual ACM-SIAM Symposium on Discrete</span> |
| <a name="l00205"></a>00205 <span class="comment"> Algorithms (SODA'07), pp. 1027-1035,</span> |
| <a name="l00206"></a>00206 <span class="comment"> http://www.stanford.edu/~darthur/kMeansPlusPlus.pdf</span> |
| <a name="l00207"></a>00207 <span class="comment"></span> |
| <a name="l00208"></a>00208 <span class="comment">[3] E. R. Hruschka, L. N. C. Silva, R. J. G. B. Campello: Clustering</span> |
| <a name="l00209"></a>00209 <span class="comment"> Gene-Expression Data: A Hybrid Approach that Iterates Between k-Means and</span> |
| <a name="l00210"></a>00210 <span class="comment"> Evolutionary Search. In: Studies in Computational Intelligence - Hybrid</span> |
| <a name="l00211"></a>00211 <span class="comment"> Evolutionary Algorithms. pp. 313-335. Springer. 2007.</span> |
| <a name="l00212"></a>00212 <span class="comment"></span> |
| <a name="l00213"></a>00213 <span class="comment">[4] Lloyd, Stuart: Least squares quantization in PCM. Technical Note, Bell</span> |
| <a name="l00214"></a>00214 <span class="comment"> Laboratories. Published much later in: IEEE Transactions on Information</span> |
| <a name="l00215"></a>00215 <span class="comment"> Theory 28(2), pp. 128-137. 1982.</span> |
| <a name="l00216"></a>00216 <span class="comment"></span> |
| <a name="l00217"></a>00217 <span class="comment">[5] Leisch, Friedrich: A Toolbox for K-Centroids Cluster Analysis. In: Computational</span> |
| <a name="l00218"></a>00218 <span class="comment"> Statistics and Data Analysis, 51(2). pp. 526-544. 2006.</span> |
| <a name="l00219"></a>00219 <span class="comment"></span> |
| <a name="l00220"></a>00220 <span class="comment">@sa File kmeans.sql_in documenting the SQL functions.</span> |
| <a name="l00221"></a>00221 <span class="comment"></span> |
| <a name="l00222"></a>00222 <span class="comment">@internal</span> |
| <a name="l00223"></a>00223 <span class="comment">@sa namespace kmeans (documenting the implementation in Python)</span> |
| <a name="l00224"></a>00224 <span class="comment">@endinternal</span> |
| <a name="l00225"></a>00225 <span class="comment">*/</span> |
| <a name="l00226"></a>00226 |
| <a name="l00227"></a>00227 /* |
| <a name="l00228"></a>00228 * @brief k-Means return type |
| <a name="l00229"></a>00229 * |
| <a name="l00230"></a>00230 * A composite value: |
| <a name="l00231"></a>00231 * - <tt>centroids</tt> - Matrix containing the new \f$ l \leq k \f$ |
| <a name="l00232"></a>00232 * repositioned centroids as columns. If this matrix has \f$ l < k \f$ |
| <a name="l00233"></a>00233 * columns, one or more old centroids no longer were closest to any point. |
| <a name="l00234"></a>00234 * - <tt>old_centroid_its</tt> - The order of the centroids in |
| <a name="l00235"></a>00235 * <tt>centroid</tt> is not guaranteed to be consitent across iterations. |
| <a name="l00236"></a>00236 * In particular, if a centroid is no longer closest to any point it can be |
| <a name="l00237"></a>00237 * dropped and a new centroid is added afterwards. We therefore need to map |
| <a name="l00238"></a>00238 * positions in <tt>centroids</tt> to the respective positions in the |
| <a name="l00239"></a>00239 * previous iteration. |
| <a name="l00240"></a>00240 * - <tt>objective_fn</tt> - Value of the objective function, i.e., |
| <a name="l00241"></a>00241 * \f$ \sum_{x \in P} \dist(x, C)^2 \f$ where |
| <a name="l00242"></a>00242 * \f$ P \f$ is the set of points, \f$ C \f$ is the set of centroids, and |
| <a name="l00243"></a>00243 * \f$ \dist(x, C) := \min_{c \in C} \operatorname{dist}(x, c) \f$. |
| <a name="l00244"></a>00244 * - <tt>frac_reassigned</tt> - Fraction of points that was assigned a |
| <a name="l00245"></a>00245 * different centroid in the current iteration. |
| <a name="l00246"></a>00246 * - <tt>num_iterations</tt> - Number of iterations performed (so far). |
| <a name="l00247"></a>00247 */ |
| <a name="l00248"></a>00248 CREATE TYPE MADLIB_SCHEMA.kmeans_result AS ( |
| <a name="l00249"></a>00249 centroids DOUBLE PRECISION[][], |
| <a name="l00250"></a>00250 objective_fn DOUBLE PRECISION, |
| <a name="l00251"></a>00251 frac_reassigned DOUBLE PRECISION, |
| <a name="l00252"></a>00252 num_iterations INTEGER |
| <a name="l00253"></a>00253 ); |
| <a name="l00254"></a>00254 |
| <a name="l00255"></a>00255 /* |
| <a name="l00256"></a>00256 * @brief k-Means inter-iteration state type |
| <a name="l00257"></a>00257 * |
| <a name="l00258"></a>00258 * A composite value like \ref{kmeans_result}. Additional fields: |
| <a name="l00259"></a>00259 * - <tt>old_centroid_its</tt> - The order of the centroids in |
| <a name="l00260"></a>00260 * <tt>centroid</tt> is not guaranteed to be consitent across iterations. |
| <a name="l00261"></a>00261 * In particular, if a centroid is no longer closest to any point it can be |
| <a name="l00262"></a>00262 * dropped and a new centroid is added afterwards. We therefore need to map |
| <a name="l00263"></a>00263 * positions in <tt>centroids</tt> to the respective positions in the |
| <a name="l00264"></a>00264 * previous iteration. |
| <a name="l00265"></a>00265 * - <tt>num_iterations</tt> - Number of iterations performed (so far). |
| <a name="l00266"></a>00266 */ |
| <a name="l00267"></a>00267 CREATE TYPE MADLIB_SCHEMA.kmeans_state AS ( |
| <a name="l00268"></a>00268 centroids DOUBLE PRECISION[][], |
| <a name="l00269"></a>00269 old_centroid_ids INTEGER[], |
| <a name="l00270"></a>00270 objective_fn DOUBLE PRECISION, |
| <a name="l00271"></a>00271 frac_reassigned DOUBLE PRECISION |
| <a name="l00272"></a>00272 ); |
| <a name="l00273"></a>00273 <span class="comment"></span> |
| <a name="l00274"></a>00274 <span class="comment">/**</span> |
| <a name="l00275"></a>00275 <span class="comment"> * @internal</span> |
| <a name="l00276"></a>00276 <span class="comment"> * @brief Execute a SQL command where $1, ..., $4 are substituted with the</span> |
| <a name="l00277"></a>00277 <span class="comment"> * given arguments.</span> |
| <a name="l00278"></a>00278 <span class="comment"> */</span> |
| <a name="l00279"></a>00279 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args( |
| <a name="l00280"></a>00280 sql VARCHAR, DOUBLE PRECISION[][], REGPROC, INTEGER, DOUBLE PRECISION |
| <a name="l00281"></a>00281 ) RETURNS VOID |
| <a name="l00282"></a>00282 VOLATILE |
| <a name="l00283"></a>00283 CALLED ON NULL INPUT |
| <a name="l00284"></a>00284 LANGUAGE c |
| <a name="l00285"></a>00285 AS 'MODULE_PATHNAME<span class="stringliteral">', '</span>exec_sql_using<span class="stringliteral">';</span> |
| <a name="l00286"></a>00286 <span class="stringliteral"></span> |
| <a name="l00287"></a>00287 <span class="stringliteral">CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans(</span> |
| <a name="l00288"></a>00288 <span class="stringliteral"> rel_args VARCHAR,</span> |
| <a name="l00289"></a>00289 <span class="stringliteral"> rel_state VARCHAR,</span> |
| <a name="l00290"></a>00290 <span class="stringliteral"> rel_source VARCHAR,</span> |
| <a name="l00291"></a>00291 <span class="stringliteral"> expr_point VARCHAR,</span> |
| <a name="l00292"></a>00292 <span class="stringliteral"> agg_centroid VARCHAR)</span> |
| <a name="l00293"></a>00293 <span class="stringliteral">RETURNS INTEGER</span> |
| <a name="l00294"></a>00294 <span class="stringliteral">VOLATILE</span> |
| <a name="l00295"></a>00295 <span class="stringliteral">LANGUAGE plpythonu</span> |
| <a name="l00296"></a>00296 <span class="stringliteral">AS $$PythonFunction(kmeans, kmeans, compute_kmeans)$$;</span> |
| <a name="l00297"></a>00297 <span class="stringliteral"></span><span class="comment"></span> |
| <a name="l00298"></a>00298 <span class="comment">/**</span> |
| <a name="l00299"></a>00299 <span class="comment"> * @brief Perform Lloyd's k-means local-search heuristic</span> |
| <a name="l00300"></a>00300 <span class="comment"> *</span> |
| <a name="l00301"></a>00301 <span class="comment"> * @param rel_source Name of the relation containing input points</span> |
| <a name="l00302"></a>00302 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span> |
| <a name="l00303"></a>00303 <span class="comment"> * @param initial_centroids Matrix containing the initial centroids as columns</span> |
| <a name="l00304"></a>00304 <span class="comment"> * @param fn_dist Name of a function with signature</span> |
| <a name="l00305"></a>00305 <span class="comment"> * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that</span> |
| <a name="l00306"></a>00306 <span class="comment"> * returns the distance between two points. The default is the</span> |
| <a name="l00307"></a>00307 <span class="comment"> * \ref squared_dist_norm2(float8[],float8[]) "squared Euclidean distance".</span> |
| <a name="l00308"></a>00308 <span class="comment"> * @param agg_centroid Name of an aggregate function with signature</span> |
| <a name="l00309"></a>00309 <span class="comment"> * <tt>DOUBLE PRECISION[] -> DOUBLE PRECISION[]</tt> that, for each group</span> |
| <a name="l00310"></a>00310 <span class="comment"> * of points, returns a centroid. In order for Lloyd's local-search</span> |
| <a name="l00311"></a>00311 <span class="comment"> * heuristic to provably converge and to return a local minimum, this</span> |
| <a name="l00312"></a>00312 <span class="comment"> * centroid should minimize the sum of distances between each point in the</span> |
| <a name="l00313"></a>00313 <span class="comment"> * group and the centroid. The default is the</span> |
| <a name="l00314"></a>00314 <span class="comment"> * \ref avg(float8[]) "average (mean/barycenter in Euclidean space)",</span> |
| <a name="l00315"></a>00315 <span class="comment"> * which satisfies this property if <tt>fn_dist = 'squared_dist_norm2'</tt>.</span> |
| <a name="l00316"></a>00316 <span class="comment"> * @param max_num_iterations Maximum number of iterations</span> |
| <a name="l00317"></a>00317 <span class="comment"> * @param min_frac_reassigned Fraction of reassigned points below which</span> |
| <a name="l00318"></a>00318 <span class="comment"> * convergence is assumed and the algorithm terminates</span> |
| <a name="l00319"></a>00319 <span class="comment"> * @returns A composite value:</span> |
| <a name="l00320"></a>00320 <span class="comment"> * - <tt>centroids</tt> - Matrix with \f$ k \f$ centroids as columns.</span> |
| <a name="l00321"></a>00321 <span class="comment"> * - <tt>frac_reassigned</tt> - Fraction of points that were assigned a</span> |
| <a name="l00322"></a>00322 <span class="comment"> * different centroid in the last iteration.</span> |
| <a name="l00323"></a>00323 <span class="comment"> * - <tt>num_iterations</tt> - The number of iterations before the</span> |
| <a name="l00324"></a>00324 <span class="comment"> * algorithm terminated</span> |
| <a name="l00325"></a>00325 <span class="comment"> */</span> |
| <a name="l00326"></a>00326 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l00327"></a>00327 rel_source VARCHAR, |
| <a name="l00328"></a>00328 expr_point VARCHAR, |
| <a name="l00329"></a>00329 initial_centroids DOUBLE PRECISION[][], |
| <a name="l00330"></a>00330 fn_dist VARCHAR /*+ DEFAULT '<a class="code" href="linalg_8sql__in.html#a00a08e69f27524f2096032214e15b668" title="Squared 2-norm of the difference between two vectors.">squared_dist_norm2</a><span class="stringliteral">' */,</span> |
| <a name="l00331"></a>00331 <span class="stringliteral"> agg_centroid VARCHAR /*+ DEFAULT '</span><a class="code" href="linalg_8sql__in.html#a1aa37f73fb1cd8d7d106aa518dd8c0b4" title="Compute the average of vectors.">avg</a><span class="stringliteral">' */,</span> |
| <a name="l00332"></a>00332 <span class="stringliteral"> max_num_iterations INTEGER /*+ DEFAULT 20 */,</span> |
| <a name="l00333"></a>00333 <span class="stringliteral"> min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */</span> |
| <a name="l00334"></a>00334 <span class="stringliteral">) RETURNS MADLIB_SCHEMA.kmeans_result AS $$</span> |
| <a name="l00335"></a>00335 <span class="stringliteral">DECLARE</span> |
| <a name="l00336"></a>00336 <span class="stringliteral"> theIteration INTEGER;</span> |
| <a name="l00337"></a>00337 <span class="stringliteral"> theResult MADLIB_SCHEMA.kmeans_result;</span> |
| <a name="l00338"></a>00338 <span class="stringliteral"> oldClientMinMessages VARCHAR;</span> |
| <a name="l00339"></a>00339 <span class="stringliteral"> class_rel_source REGCLASS;</span> |
| <a name="l00340"></a>00340 <span class="stringliteral"> proc_fn_dist REGPROCEDURE;</span> |
| <a name="l00341"></a>00341 <span class="stringliteral"> proc_agg_centroid REGPROCEDURE;</span> |
| <a name="l00342"></a>00342 <span class="stringliteral"> rel_filtered VARCHAR;</span> |
| <a name="l00343"></a>00343 <span class="stringliteral"> num_points INTEGER;</span> |
| <a name="l00344"></a>00344 <span class="stringliteral"> k INTEGER;</span> |
| <a name="l00345"></a>00345 <span class="stringliteral"> centroids FLOAT8[];</span> |
| <a name="l00346"></a>00346 <span class="stringliteral">BEGIN</span> |
| <a name="l00347"></a>00347 <span class="stringliteral"> IF (array_upper(initial_centroids,1) IS NULL) THEN</span> |
| <a name="l00348"></a><a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d">00348</a> <span class="stringliteral"> RAISE EXCEPTION '</span>No valid initial centroids given.<span class="stringliteral">';</span> |
| <a name="l00349"></a>00349 <span class="stringliteral"> END IF;</span> |
| <a name="l00350"></a>00350 <span class="stringliteral"></span> |
| <a name="l00351"></a>00351 <span class="stringliteral"> centroids := ARRAY(SELECT unnest(initial_centroids));</span> |
| <a name="l00352"></a>00352 <span class="stringliteral"> IF (SELECT MADLIB_SCHEMA.svec_elsum(centroids)) >= '</span>Infinity<span class="stringliteral">'::float THEN</span> |
| <a name="l00353"></a>00353 <span class="stringliteral"> RAISE EXCEPTION '</span>At least one initial centroid has non-finite values.<span class="stringliteral">';</span> |
| <a name="l00354"></a>00354 <span class="stringliteral"> END IF;</span> |
| <a name="l00355"></a>00355 <span class="stringliteral"></span> |
| <a name="l00356"></a>00356 <span class="stringliteral"> rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);</span> |
| <a name="l00357"></a>00357 <span class="stringliteral"> class_rel_source := rel_filtered;</span> |
| <a name="l00358"></a>00358 <span class="stringliteral"> proc_fn_dist := fn_dist</span> |
| <a name="l00359"></a>00359 <span class="stringliteral"> || '</span>(DOUBLE PRECISION[], DOUBLE PRECISION[])<span class="stringliteral">';</span> |
| <a name="l00360"></a>00360 <span class="stringliteral"> IF (SELECT prorettype != '</span>DOUBLE PRECISION<span class="stringliteral">'::regtype OR proisagg = TRUE</span> |
| <a name="l00361"></a>00361 <span class="stringliteral"> FROM pg_proc WHERE oid = proc_fn_dist) THEN</span> |
| <a name="l00362"></a>00362 <span class="stringliteral"> RAISE EXCEPTION '</span>Distance <span class="keyword">function</span> has wrong signature or is not a simple <span class="keyword">function</span>.<span class="stringliteral">';</span> |
| <a name="l00363"></a>00363 <span class="stringliteral"> END IF;</span> |
| <a name="l00364"></a>00364 <span class="stringliteral"> proc_agg_centroid := agg_centroid || '</span>(DOUBLE PRECISION[])<span class="stringliteral">';</span> |
| <a name="l00365"></a>00365 <span class="stringliteral"> IF (SELECT prorettype != '</span>DOUBLE PRECISION[]<span class="stringliteral">'::regtype OR proisagg = FALSE</span> |
| <a name="l00366"></a>00366 <span class="stringliteral"> FROM pg_proc WHERE oid = proc_agg_centroid) THEN</span> |
| <a name="l00367"></a>00367 <span class="stringliteral"> RAISE EXCEPTION '</span>Mean aggregate has wrong signature or is not an aggregate.<span class="stringliteral">';</span> |
| <a name="l00368"></a>00368 <span class="stringliteral"> END IF;</span> |
| <a name="l00369"></a>00369 <span class="stringliteral"> IF (min_frac_reassigned < 0) OR (min_frac_reassigned > 1) THEN</span> |
| <a name="l00370"></a>00370 <span class="stringliteral"> RAISE EXCEPTION '</span>Convergence threshold is not a valid value (must be a fraction between 0 and 1).<span class="stringliteral">';</span> |
| <a name="l00371"></a>00371 <span class="stringliteral"> END IF;</span> |
| <a name="l00372"></a>00372 <span class="stringliteral"> IF (max_num_iterations < 0) THEN</span> |
| <a name="l00373"></a>00373 <span class="stringliteral"> RAISE EXCEPTION '</span>Number of iterations must be a non-negative integer.<span class="stringliteral">';</span> |
| <a name="l00374"></a>00374 <span class="stringliteral"> END IF;</span> |
| <a name="l00375"></a>00375 <span class="stringliteral"></span> |
| <a name="l00376"></a>00376 <span class="stringliteral"> -- Extra parameter check added so that ERROR output is more user-readable (doesn'</span>t include Python traceback) |
| <a name="l00377"></a>00377 k := array_upper(initial_centroids,1); |
| <a name="l00378"></a>00378 IF (k <= 0) THEN |
| <a name="l00379"></a>00379 RAISE EXCEPTION 'Number of clusters k must be a positive integer.'; |
| <a name="l00380"></a>00380 END IF; |
| <a name="l00381"></a>00381 IF (k > 32767) THEN |
| <a name="l00382"></a>00382 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).'; |
| <a name="l00383"></a>00383 END IF; |
| <a name="l00384"></a>00384 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ; |
| <a name="l00385"></a>00385 IF (num_points < k) THEN |
| <a name="l00386"></a>00386 RAISE EXCEPTION 'Number of centroids is greater than number of points.'; |
| <a name="l00387"></a>00387 END IF; |
| <a name="l00388"></a>00388 |
| <a name="l00389"></a>00389 -- We first setup the argument table. Rationale: We want to avoid all data |
| <a name="l00390"></a>00390 -- conversion between native types and Python code. Instead, we use Python |
| <a name="l00391"></a>00391 -- as a pure driver layer. |
| <a name="l00392"></a>00392 PERFORM MADLIB_SCHEMA.<a class="code" href="utilities_8sql__in.html#a56501b6f9fabe65d7a6a6beb70a0e000" title="Create the temporary schema if it does not exist yet.">create_schema_pg_temp</a>(); |
| <a name="l00393"></a>00393 oldClientMinMessages := |
| <a name="l00394"></a>00394 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| <a name="l00395"></a>00395 EXECUTE 'SET client_min_messages TO warning'; |
| <a name="l00396"></a>00396 |
| <a name="l00397"></a>00397 -- Unfortunately, the EXECUTE USING syntax is only available starting |
| <a name="l00398"></a>00398 -- PostgreSQL 8.4: |
| <a name="l00399"></a>00399 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span> |
| <a name="l00400"></a>00400 -- We therefore have to emulate. |
| <a name="l00401"></a>00401 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$ |
| <a name="l00402"></a>00402 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_args; |
| <a name="l00403"></a>00403 CREATE TABLE pg_temp._madlib_kmeans_args AS |
| <a name="l00404"></a>00404 SELECT |
| <a name="l00405"></a>00405 $1 AS initial_centroids, array_upper($1, 1) AS k, |
| <a name="l00406"></a>00406 $2 AS fn_dist, $3 AS max_num_iterations, |
| <a name="l00407"></a>00407 $4 AS min_frac_reassigned; |
| <a name="l00408"></a>00408 $sql$, |
| <a name="l00409"></a>00409 initial_centroids, proc_fn_dist, max_num_iterations, |
| <a name="l00410"></a>00410 min_frac_reassigned); |
| <a name="l00411"></a>00411 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| <a name="l00412"></a>00412 |
| <a name="l00413"></a>00413 -- Perform acutal computation. |
| <a name="l00414"></a>00414 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| <a name="l00415"></a>00415 -- operators from regclass to varchar/text. |
| <a name="l00416"></a>00416 theIteration := MADLIB_SCHEMA.internal_compute_kmeans('_madlib_kmeans_args', |
| <a name="l00417"></a>00417 '_madlib_kmeans_state', |
| <a name="l00418"></a>00418 textin(regclassout(class_rel_source)), expr_point, |
| <a name="l00419"></a>00419 textin(regprocout(proc_agg_centroid))); |
| <a name="l00420"></a>00420 |
| <a name="l00421"></a>00421 -- Retrieve result from state table and return it |
| <a name="l00422"></a>00422 EXECUTE |
| <a name="l00423"></a>00423 $sql$ |
| <a name="l00424"></a>00424 SELECT (_state).centroids, (_state).objective_fn, |
| <a name="l00425"></a>00425 (_state).frac_reassigned, NULL |
| <a name="l00426"></a>00426 FROM _madlib_kmeans_state |
| <a name="l00427"></a>00427 WHERE _iteration = $sql$ || theIteration || $sql$ |
| <a name="l00428"></a>00428 $sql$ |
| <a name="l00429"></a>00429 INTO theResult; |
| <a name="l00430"></a>00430 -- The number of iterations are not updated in the C++ code. We do it here. |
| <a name="l00431"></a>00431 IF NOT (theResult IS NULL) THEN |
| <a name="l00432"></a>00432 theResult.num_iterations = theIteration; |
| <a name="l00433"></a>00433 END IF; |
| <a name="l00434"></a>00434 RETURN theResult; |
| <a name="l00435"></a>00435 END; |
| <a name="l00436"></a>00436 $$ LANGUAGE plpgsql VOLATILE; |
| <a name="l00437"></a>00437 |
| <a name="l00438"></a>00438 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d" title="Perform Lloyd's k-means local-search heuristic.">kmeans</a>( |
| <a name="l00439"></a>00439 rel_source VARCHAR, |
| <a name="l00440"></a>00440 expr_point VARCHAR, |
| <a name="l00441"></a>00441 initial_centroids DOUBLE PRECISION[][], |
| <a name="l00442"></a>00442 fn_dist VARCHAR, |
| <a name="l00443"></a>00443 agg_centroid VARCHAR, |
| <a name="l00444"></a>00444 max_num_iterations INTEGER |
| <a name="l00445"></a>00445 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00446"></a>00446 VOLATILE |
| <a name="l00447"></a>00447 STRICT |
| <a name="l00448"></a>00448 LANGUAGE sql AS $$ |
| <a name="l00449"></a>00449 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d" title="Perform Lloyd's k-means local-search heuristic.">kmeans</a>($1, $2, $3, $4, $5, $6, 0.001) |
| <a name="l00450"></a>00450 $$; |
| <a name="l00451"></a>00451 |
| <a name="l00452"></a>00452 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l00453"></a>00453 rel_source VARCHAR, |
| <a name="l00454"></a>00454 expr_point VARCHAR, |
| <a name="l00455"></a>00455 initial_centroids DOUBLE PRECISION[][], |
| <a name="l00456"></a>00456 fn_dist VARCHAR, |
| <a name="l00457"></a>00457 agg_centroid VARCHAR |
| <a name="l00458"></a>00458 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00459"></a>00459 VOLATILE |
| <a name="l00460"></a>00460 STRICT |
| <a name="l00461"></a>00461 LANGUAGE sql AS $$ |
| <a name="l00462"></a>00462 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, 20, 0.001) |
| <a name="l00463"></a>00463 $$; |
| <a name="l00464"></a>00464 |
| <a name="l00465"></a>00465 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l00466"></a>00466 rel_source VARCHAR, |
| <a name="l00467"></a>00467 expr_point VARCHAR, |
| <a name="l00468"></a>00468 initial_centroids DOUBLE PRECISION[][], |
| <a name="l00469"></a>00469 fn_dist VARCHAR |
| <a name="l00470"></a>00470 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00471"></a>00471 VOLATILE |
| <a name="l00472"></a>00472 STRICT |
| <a name="l00473"></a>00473 LANGUAGE sql AS $$ |
| <a name="l00474"></a>00474 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, 'MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a1aa37f73fb1cd8d7d106aa518dd8c0b4" title="Compute the average of vectors.">avg</a>', 20, |
| <a name="l00475"></a>00475 0.001) |
| <a name="l00476"></a>00476 $$; |
| <a name="l00477"></a>00477 |
| <a name="l00478"></a>00478 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l00479"></a>00479 rel_source VARCHAR, |
| <a name="l00480"></a>00480 expr_point VARCHAR, |
| <a name="l00481"></a>00481 initial_centroids DOUBLE PRECISION[][] |
| <a name="l00482"></a>00482 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00483"></a>00483 VOLATILE |
| <a name="l00484"></a>00484 STRICT |
| <a name="l00485"></a>00485 LANGUAGE sql AS $$ |
| <a name="l00486"></a>00486 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, |
| <a name="l00487"></a>00487 'MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a00a08e69f27524f2096032214e15b668" title="Squared 2-norm of the difference between two vectors.">squared_dist_norm2</a>', 'MADLIB_SCHEMA.avg', 20, 0.001) |
| <a name="l00488"></a>00488 $$; |
| <a name="l00489"></a>00489 |
| <a name="l00490"></a>00490 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args( |
| <a name="l00491"></a>00491 sql VARCHAR, INTEGER, REGPROC, DOUBLE PRECISION[][] |
| <a name="l00492"></a>00492 ) RETURNS VOID |
| <a name="l00493"></a>00493 VOLATILE |
| <a name="l00494"></a>00494 CALLED ON NULL INPUT |
| <a name="l00495"></a>00495 LANGUAGE c |
| <a name="l00496"></a>00496 AS 'MODULE_PATHNAME', 'exec_sql_using'; |
| <a name="l00497"></a>00497 |
| <a name="l00498"></a>00498 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeanspp_seeding( |
| <a name="l00499"></a>00499 rel_args VARCHAR, |
| <a name="l00500"></a>00500 rel_state VARCHAR, |
| <a name="l00501"></a>00501 rel_source VARCHAR, |
| <a name="l00502"></a>00502 expr_point VARCHAR) |
| <a name="l00503"></a>00503 RETURNS INTEGER |
| <a name="l00504"></a>00504 AS $$PythonFunction(kmeans, kmeans, compute_kmeanspp_seeding)$$ |
| <a name="l00505"></a>00505 LANGUAGE plpythonu VOLATILE; |
| <a name="l00506"></a>00506 <span class="comment"></span> |
| <a name="l00507"></a>00507 <span class="comment">/**</span> |
| <a name="l00508"></a>00508 <span class="comment"> * @brief k-Means++ Seeding</span> |
| <a name="l00509"></a>00509 <span class="comment"> *</span> |
| <a name="l00510"></a>00510 <span class="comment"> * @param rel_source Name of the relation containing input points</span> |
| <a name="l00511"></a>00511 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span> |
| <a name="l00512"></a>00512 <span class="comment"> * @param k Number of centroids</span> |
| <a name="l00513"></a>00513 <span class="comment"> * @param fn_dist Name of a function with signature</span> |
| <a name="l00514"></a>00514 <span class="comment"> * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that</span> |
| <a name="l00515"></a>00515 <span class="comment"> * returns the distance between two points</span> |
| <a name="l00516"></a>00516 <span class="comment"> * @param initial_centroids A matrix containing up to \f$ k \f$ columns as</span> |
| <a name="l00517"></a>00517 <span class="comment"> * columns. kmeanspp_seeding() proceeds exactly as if these centroids had</span> |
| <a name="l00518"></a>00518 <span class="comment"> * already been generated in previous iterations. This parameter may be</span> |
| <a name="l00519"></a>00519 <span class="comment"> * NULL in which all \f$ k \f$ centroids will be generated.</span> |
| <a name="l00520"></a>00520 <span class="comment"> * @returns A matrix containing \f$ k \f$ centroids as columns</span> |
| <a name="l00521"></a>00521 <span class="comment"> */</span> |
| <a name="l00522"></a>00522 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>( |
| <a name="l00523"></a>00523 rel_source VARCHAR, |
| <a name="l00524"></a>00524 expr_point VARCHAR, |
| <a name="l00525"></a>00525 k INTEGER, |
| <a name="l00526"></a>00526 fn_dist VARCHAR <span class="comment">/*+ DEFAULT 'squared_dist_norm2' */</span>, |
| <a name="l00527"></a>00527 initial_centroids DOUBLE PRECISION[][] <span class="comment">/*+ DEFAULT NULL */</span> |
| <a name="l00528"></a>00528 ) RETURNS DOUBLE PRECISION[][] AS $$ |
| <a name="l00529"></a>00529 DECLARE |
| <a name="l00530"></a>00530 theIteration INTEGER; |
| <a name="l00531"></a>00531 theResult DOUBLE PRECISION[][]; |
| <a name="l00532"></a>00532 oldClientMinMessages VARCHAR; |
| <a name="l00533"></a>00533 class_rel_source REGCLASS; |
| <a name="l00534"></a>00534 proc_fn_dist REGPROCEDURE; |
| <a name="l00535"></a>00535 num_points INTEGER; |
| <a name="l00536"></a>00536 num_centroids INTEGER; |
| <a name="l00537"></a>00537 rel_filtered VARCHAR; |
| <a name="l00538"></a>00538 BEGIN |
| <a name="l00539"></a>00539 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point); |
| <a name="l00540"></a>00540 class_rel_source := rel_filtered; |
| <a name="l00541"></a>00541 |
| <a name="l00542"></a>00542 IF (initial_centroids IS NOT NULL) THEN |
| <a name="l00543"></a>00543 num_centroids := array_upper(initial_centroids,1); |
| <a name="l00544"></a><a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e">00544</a> ELSE |
| <a name="l00545"></a>00545 num_centroids := k; |
| <a name="l00546"></a>00546 END IF; |
| <a name="l00547"></a>00547 |
| <a name="l00548"></a>00548 proc_fn_dist := fn_dist |
| <a name="l00549"></a>00549 || '(DOUBLE PRECISION[], DOUBLE PRECISION[])'; |
| <a name="l00550"></a>00550 IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE |
| <a name="l00551"></a>00551 FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| <a name="l00552"></a>00552 RAISE EXCEPTION 'Distance function has wrong signature or is not a simple function.'; |
| <a name="l00553"></a>00553 END IF; |
| <a name="l00554"></a>00554 IF (k <= 0) THEN |
| <a name="l00555"></a>00555 RAISE EXCEPTION 'Number of clusters k must be a positive integer.'; |
| <a name="l00556"></a>00556 END IF; |
| <a name="l00557"></a>00557 IF (k > 32767) THEN |
| <a name="l00558"></a>00558 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).'; |
| <a name="l00559"></a>00559 END IF; |
| <a name="l00560"></a>00560 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ; |
| <a name="l00561"></a>00561 IF (num_points < k OR num_points < num_centroids) THEN |
| <a name="l00562"></a>00562 RAISE EXCEPTION 'Number of centroids is greater than number of points.'; |
| <a name="l00563"></a>00563 END IF; |
| <a name="l00564"></a>00564 IF (k < num_centroids) THEN |
| <a name="l00565"></a>00565 RAISE WARNING 'Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.'; |
| <a name="l00566"></a>00566 END IF; |
| <a name="l00567"></a>00567 |
| <a name="l00568"></a>00568 -- We first setup the argument table. Rationale: We want to avoid all data |
| <a name="l00569"></a>00569 -- conversion between native types and Python code. Instead, we use Python |
| <a name="l00570"></a>00570 -- as a pure driver layer. |
| <a name="l00571"></a>00571 oldClientMinMessages := |
| <a name="l00572"></a>00572 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| <a name="l00573"></a>00573 EXECUTE 'SET client_min_messages TO warning'; |
| <a name="l00574"></a>00574 PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| <a name="l00575"></a>00575 -- Unfortunately, the EXECUTE USING syntax is only available starting |
| <a name="l00576"></a>00576 -- PostgreSQL 8.4: |
| <a name="l00577"></a>00577 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span> |
| <a name="l00578"></a>00578 -- We therefore have to emulate. |
| <a name="l00579"></a>00579 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args($sql$ |
| <a name="l00580"></a>00580 DROP TABLE IF EXISTS pg_temp._madlib_kmeanspp_args; |
| <a name="l00581"></a>00581 CREATE TEMPORARY TABLE _madlib_kmeanspp_args AS |
| <a name="l00582"></a>00582 SELECT $1 AS k, $2 AS fn_dist, $3 AS initial_centroids; |
| <a name="l00583"></a>00583 $sql$, |
| <a name="l00584"></a>00584 k, proc_fn_dist, initial_centroids); |
| <a name="l00585"></a>00585 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| <a name="l00586"></a>00586 |
| <a name="l00587"></a>00587 -- Perform acutal computation. |
| <a name="l00588"></a>00588 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| <a name="l00589"></a>00589 -- operators from regclass to varchar/text. |
| <a name="l00590"></a>00590 theIteration := ( |
| <a name="l00591"></a>00591 SELECT MADLIB_SCHEMA.internal_compute_kmeanspp_seeding( |
| <a name="l00592"></a>00592 '_madlib_kmeanspp_args', '_madlib_kmeanspp_state', |
| <a name="l00593"></a>00593 textin(regclassout(class_rel_source)), expr_point) |
| <a name="l00594"></a>00594 ); |
| <a name="l00595"></a>00595 |
| <a name="l00596"></a>00596 -- Retrieve result from state table and return it |
| <a name="l00597"></a>00597 EXECUTE |
| <a name="l00598"></a>00598 $sql$ |
| <a name="l00599"></a>00599 SELECT _state FROM _madlib_kmeanspp_state |
| <a name="l00600"></a>00600 WHERE _iteration = $sql$ || theIteration || $sql$ |
| <a name="l00601"></a>00601 $sql$ |
| <a name="l00602"></a>00602 INTO theResult; |
| <a name="l00603"></a>00603 RETURN theResult; |
| <a name="l00604"></a>00604 END; |
| <a name="l00605"></a>00605 $$ LANGUAGE plpgsql VOLATILE; |
| <a name="l00606"></a>00606 |
| <a name="l00607"></a>00607 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>( |
| <a name="l00608"></a>00608 rel_source VARCHAR, |
| <a name="l00609"></a>00609 expr_point VARCHAR, |
| <a name="l00610"></a>00610 k INTEGER, |
| <a name="l00611"></a>00611 fn_dist VARCHAR |
| <a name="l00612"></a>00612 ) RETURNS DOUBLE PRECISION[][] |
| <a name="l00613"></a>00613 LANGUAGE sql AS $$ |
| <a name="l00614"></a>00614 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>($1, $2, $3, $4, NULL) |
| <a name="l00615"></a>00615 $$; |
| <a name="l00616"></a>00616 |
| <a name="l00617"></a>00617 CREATE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding( |
| <a name="l00618"></a>00618 rel_source VARCHAR, |
| <a name="l00619"></a>00619 expr_point VARCHAR, |
| <a name="l00620"></a>00620 k INTEGER |
| <a name="l00621"></a>00621 ) RETURNS DOUBLE PRECISION[][] |
| <a name="l00622"></a>00622 LANGUAGE sql AS $$ |
| <a name="l00623"></a>00623 SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, |
| <a name="l00624"></a>00624 'MADLIB_SCHEMA.squared_dist_norm2', NULL) |
| <a name="l00625"></a>00625 $$; |
| <a name="l00626"></a>00626 <span class="comment"></span> |
| <a name="l00627"></a>00627 <span class="comment">/**</span> |
| <a name="l00628"></a>00628 <span class="comment"> * @brief Run k-Means++.</span> |
| <a name="l00629"></a>00629 <span class="comment"> *</span> |
| <a name="l00630"></a>00630 <span class="comment"> * This is a shortcut for running k-means++. It is equivalent to</span> |
| <a name="l00631"></a>00631 <span class="comment"> * <pre>SELECT \ref kmeans(</span> |
| <a name="l00632"></a>00632 <span class="comment"> rel_source,</span> |
| <a name="l00633"></a>00633 <span class="comment"> expr_point,</span> |
| <a name="l00634"></a>00634 <span class="comment"> \ref kmeanspp_seeding(</span> |
| <a name="l00635"></a>00635 <span class="comment"> rel_source,</span> |
| <a name="l00636"></a>00636 <span class="comment"> expr_point,</span> |
| <a name="l00637"></a>00637 <span class="comment"> k,</span> |
| <a name="l00638"></a>00638 <span class="comment"> fn_dist</span> |
| <a name="l00639"></a>00639 <span class="comment"> ),</span> |
| <a name="l00640"></a>00640 <span class="comment"> fn_dist,</span> |
| <a name="l00641"></a>00641 <span class="comment"> agg_centroid,</span> |
| <a name="l00642"></a>00642 <span class="comment"> max_num_iterations,</span> |
| <a name="l00643"></a>00643 <span class="comment"> min_frac_reassigned</span> |
| <a name="l00644"></a>00644 <span class="comment">)</pre></span> |
| <a name="l00645"></a>00645 <span class="comment"> */</span> |
| <a name="l00646"></a>00646 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>( |
| <a name="l00647"></a>00647 rel_source VARCHAR, |
| <a name="l00648"></a>00648 expr_point VARCHAR, |
| <a name="l00649"></a>00649 k INTEGER, |
| <a name="l00650"></a>00650 fn_dist VARCHAR <span class="comment">/*+ DEFAULT 'squared_dist_norm2' */</span>, |
| <a name="l00651"></a>00651 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT 'avg' */</span>, |
| <a name="l00652"></a>00652 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>, |
| <a name="l00653"></a>00653 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span> |
| <a name="l00654"></a>00654 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00655"></a>00655 VOLATILE |
| <a name="l00656"></a>00656 STRICT |
| <a name="l00657"></a>00657 LANGUAGE plpgsql |
| <a name="l00658"></a>00658 AS $$ |
| <a name="l00659"></a>00659 DECLARE |
| <a name="l00660"></a>00660 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00661"></a>00661 BEGIN |
| <a name="l00662"></a>00662 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00663"></a>00663 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4), |
| <a name="l00664"></a>00664 $4, $5, $6, $7); |
| <a name="l00665"></a>00665 RETURN ret; |
| <a name="l00666"></a>00666 END |
| <a name="l00667"></a>00667 $$; |
| <a name="l00668"></a><a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4">00668</a> |
| <a name="l00669"></a>00669 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>( |
| <a name="l00670"></a>00670 rel_source VARCHAR, |
| <a name="l00671"></a>00671 expr_point VARCHAR, |
| <a name="l00672"></a>00672 k INTEGER, |
| <a name="l00673"></a>00673 fn_dist VARCHAR, |
| <a name="l00674"></a>00674 agg_centroid VARCHAR, |
| <a name="l00675"></a>00675 max_num_iterations INTEGER |
| <a name="l00676"></a>00676 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00677"></a>00677 VOLATILE |
| <a name="l00678"></a>00678 STRICT |
| <a name="l00679"></a>00679 LANGUAGE plpgsql |
| <a name="l00680"></a>00680 AS $$ |
| <a name="l00681"></a>00681 DECLARE |
| <a name="l00682"></a>00682 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00683"></a>00683 BEGIN |
| <a name="l00684"></a>00684 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00685"></a>00685 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4), |
| <a name="l00686"></a>00686 $4, $5, $6, 0.001); |
| <a name="l00687"></a>00687 RETURN ret; |
| <a name="l00688"></a>00688 END |
| <a name="l00689"></a>00689 $$; |
| <a name="l00690"></a>00690 |
| <a name="l00691"></a>00691 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>( |
| <a name="l00692"></a>00692 rel_source VARCHAR, |
| <a name="l00693"></a>00693 expr_point VARCHAR, |
| <a name="l00694"></a>00694 k INTEGER, |
| <a name="l00695"></a>00695 fn_dist VARCHAR, |
| <a name="l00696"></a>00696 agg_centroid VARCHAR |
| <a name="l00697"></a>00697 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00698"></a>00698 VOLATILE |
| <a name="l00699"></a>00699 STRICT |
| <a name="l00700"></a>00700 LANGUAGE plpgsql |
| <a name="l00701"></a>00701 AS $$ |
| <a name="l00702"></a>00702 DECLARE |
| <a name="l00703"></a>00703 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00704"></a>00704 BEGIN |
| <a name="l00705"></a>00705 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00706"></a>00706 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4), |
| <a name="l00707"></a>00707 $4, $5, 20, 0.001); |
| <a name="l00708"></a>00708 RETURN ret; |
| <a name="l00709"></a>00709 END |
| <a name="l00710"></a>00710 $$; |
| <a name="l00711"></a>00711 |
| <a name="l00712"></a>00712 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>( |
| <a name="l00713"></a>00713 rel_source VARCHAR, |
| <a name="l00714"></a>00714 expr_point VARCHAR, |
| <a name="l00715"></a>00715 k INTEGER, |
| <a name="l00716"></a>00716 fn_dist VARCHAR |
| <a name="l00717"></a>00717 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00718"></a>00718 VOLATILE |
| <a name="l00719"></a>00719 STRICT |
| <a name="l00720"></a>00720 LANGUAGE plpgsql |
| <a name="l00721"></a>00721 AS $$ |
| <a name="l00722"></a>00722 DECLARE |
| <a name="l00723"></a>00723 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00724"></a>00724 BEGIN |
| <a name="l00725"></a>00725 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00726"></a>00726 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4), |
| <a name="l00727"></a>00727 $4, 'MADLIB_SCHEMA.avg', 20, 0.001); |
| <a name="l00728"></a>00728 RETURN ret; |
| <a name="l00729"></a>00729 END |
| <a name="l00730"></a>00730 $$; |
| <a name="l00731"></a>00731 |
| <a name="l00732"></a>00732 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>( |
| <a name="l00733"></a>00733 rel_source VARCHAR, |
| <a name="l00734"></a>00734 expr_point VARCHAR, |
| <a name="l00735"></a>00735 k INTEGER |
| <a name="l00736"></a>00736 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00737"></a>00737 VOLATILE |
| <a name="l00738"></a>00738 STRICT |
| <a name="l00739"></a>00739 LANGUAGE plpgsql |
| <a name="l00740"></a>00740 AS $$ |
| <a name="l00741"></a>00741 DECLARE |
| <a name="l00742"></a>00742 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00743"></a>00743 BEGIN |
| <a name="l00744"></a>00744 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00745"></a>00745 $1, $2, |
| <a name="l00746"></a>00746 MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, |
| <a name="l00747"></a>00747 'MADLIB_SCHEMA.squared_dist_norm2'), |
| <a name="l00748"></a>00748 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001); |
| <a name="l00749"></a>00749 RETURN ret; |
| <a name="l00750"></a>00750 END |
| <a name="l00751"></a>00751 $$; |
| <a name="l00752"></a>00752 |
| <a name="l00753"></a>00753 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args( |
| <a name="l00754"></a>00754 sql VARCHAR, INTEGER, DOUBLE PRECISION[][] |
| <a name="l00755"></a>00755 ) RETURNS VOID |
| <a name="l00756"></a>00756 VOLATILE |
| <a name="l00757"></a>00757 CALLED ON NULL INPUT |
| <a name="l00758"></a>00758 LANGUAGE c |
| <a name="l00759"></a>00759 AS 'MODULE_PATHNAME', 'exec_sql_using'; |
| <a name="l00760"></a>00760 |
| <a name="l00761"></a>00761 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans_random_seeding( |
| <a name="l00762"></a>00762 rel_args VARCHAR, |
| <a name="l00763"></a>00763 rel_state VARCHAR, |
| <a name="l00764"></a>00764 rel_source VARCHAR, |
| <a name="l00765"></a>00765 expr_point VARCHAR) |
| <a name="l00766"></a>00766 RETURNS INTEGER |
| <a name="l00767"></a>00767 AS $$PythonFunction(kmeans, kmeans, compute_kmeans_random_seeding)$$ |
| <a name="l00768"></a>00768 LANGUAGE plpythonu VOLATILE; |
| <a name="l00769"></a>00769 <span class="comment"></span> |
| <a name="l00770"></a>00770 <span class="comment">/**</span> |
| <a name="l00771"></a>00771 <span class="comment"> * @brief k-Means Random Seeding</span> |
| <a name="l00772"></a>00772 <span class="comment"> *</span> |
| <a name="l00773"></a>00773 <span class="comment"> * @param rel_source Name of the relation containing input points</span> |
| <a name="l00774"></a>00774 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span> |
| <a name="l00775"></a>00775 <span class="comment"> * @param k Number of centroids</span> |
| <a name="l00776"></a>00776 <span class="comment"> * @param initial_centroids A matrix containing up to \f$ k \f$ columns as</span> |
| <a name="l00777"></a>00777 <span class="comment"> * columns. kmeanspp_seeding() proceeds exactly as if these centroids had</span> |
| <a name="l00778"></a>00778 <span class="comment"> * already been generated in previous iterations. This parameter may be</span> |
| <a name="l00779"></a>00779 <span class="comment"> * NULL in which all \f$ k \f$ centroids will be generated.</span> |
| <a name="l00780"></a>00780 <span class="comment"> * @returns A matrix containing \f$ k \f$ centroids as columns</span> |
| <a name="l00781"></a>00781 <span class="comment"> */</span> |
| <a name="l00782"></a>00782 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>( |
| <a name="l00783"></a>00783 rel_source VARCHAR, |
| <a name="l00784"></a>00784 expr_point VARCHAR, |
| <a name="l00785"></a>00785 k INTEGER, |
| <a name="l00786"></a>00786 initial_centroids DOUBLE PRECISION[][] <span class="comment">/*+ DEFAULT NULL */</span> |
| <a name="l00787"></a>00787 ) RETURNS DOUBLE PRECISION[][] AS $$ |
| <a name="l00788"></a>00788 DECLARE |
| <a name="l00789"></a>00789 theIteration INTEGER; |
| <a name="l00790"></a>00790 theResult DOUBLE PRECISION[][]; |
| <a name="l00791"></a>00791 oldClientMinMessages VARCHAR; |
| <a name="l00792"></a>00792 class_rel_source REGCLASS; |
| <a name="l00793"></a>00793 proc_fn_dist REGPROCEDURE; |
| <a name="l00794"></a>00794 num_points INTEGER; |
| <a name="l00795"></a>00795 num_centroids INTEGER; |
| <a name="l00796"></a>00796 rel_filtered VARCHAR; |
| <a name="l00797"></a>00797 BEGIN |
| <a name="l00798"></a>00798 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point); |
| <a name="l00799"></a>00799 class_rel_source := rel_filtered; |
| <a name="l00800"></a>00800 |
| <a name="l00801"></a>00801 IF (initial_centroids IS NOT NULL) THEN |
| <a name="l00802"></a>00802 num_centroids := array_upper(initial_centroids,1); |
| <a name="l00803"></a>00803 ELSE |
| <a name="l00804"></a><a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3">00804</a> num_centroids := k; |
| <a name="l00805"></a>00805 END IF; |
| <a name="l00806"></a>00806 |
| <a name="l00807"></a>00807 IF (k <= 0) THEN |
| <a name="l00808"></a>00808 RAISE EXCEPTION 'Number of clusters k must be a positive integer.'; |
| <a name="l00809"></a>00809 END IF; |
| <a name="l00810"></a>00810 IF (k > 32767) THEN |
| <a name="l00811"></a>00811 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).'; |
| <a name="l00812"></a>00812 END IF; |
| <a name="l00813"></a>00813 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points; |
| <a name="l00814"></a>00814 IF (num_points < k OR num_points < num_centroids) THEN |
| <a name="l00815"></a>00815 RAISE EXCEPTION 'Number of centroids is greater than number of points.'; |
| <a name="l00816"></a>00816 END IF; |
| <a name="l00817"></a>00817 IF (k < num_centroids) THEN |
| <a name="l00818"></a>00818 RAISE WARNING 'Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.'; |
| <a name="l00819"></a>00819 END IF; |
| <a name="l00820"></a>00820 |
| <a name="l00821"></a>00821 -- We first setup the argument table. Rationale: We want to avoid all data |
| <a name="l00822"></a>00822 -- conversion between native types and Python code. Instead, we use Python |
| <a name="l00823"></a>00823 -- as a pure driver layer. |
| <a name="l00824"></a>00824 oldClientMinMessages := |
| <a name="l00825"></a>00825 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| <a name="l00826"></a>00826 EXECUTE 'SET client_min_messages TO warning'; |
| <a name="l00827"></a>00827 PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| <a name="l00828"></a>00828 -- Unfortunately, the EXECUTE USING syntax is only available starting |
| <a name="l00829"></a>00829 -- PostgreSQL 8.4: |
| <a name="l00830"></a>00830 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span> |
| <a name="l00831"></a>00831 -- We therefore have to emulate. |
| <a name="l00832"></a>00832 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args($sql$ |
| <a name="l00833"></a>00833 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_random_args; |
| <a name="l00834"></a>00834 CREATE TEMPORARY TABLE _madlib_kmeans_random_args AS |
| <a name="l00835"></a>00835 SELECT $1 AS k, $2 AS initial_centroids; |
| <a name="l00836"></a>00836 $sql$, |
| <a name="l00837"></a>00837 k, initial_centroids); |
| <a name="l00838"></a>00838 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| <a name="l00839"></a>00839 |
| <a name="l00840"></a>00840 -- Perform acutal computation. |
| <a name="l00841"></a>00841 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| <a name="l00842"></a>00842 -- operators from regclass to varchar/text. |
| <a name="l00843"></a>00843 theIteration := ( |
| <a name="l00844"></a>00844 SELECT MADLIB_SCHEMA.internal_compute_kmeans_random_seeding( |
| <a name="l00845"></a>00845 '_madlib_kmeans_random_args', '_madlib_kmeans_random_state', |
| <a name="l00846"></a>00846 textin(regclassout(class_rel_source)), expr_point) |
| <a name="l00847"></a>00847 ); |
| <a name="l00848"></a>00848 |
| <a name="l00849"></a>00849 -- Retrieve result from state table and return it |
| <a name="l00850"></a>00850 EXECUTE |
| <a name="l00851"></a>00851 $sql$ |
| <a name="l00852"></a>00852 SELECT _state FROM _madlib_kmeans_random_state |
| <a name="l00853"></a>00853 WHERE _iteration = $sql$ || theIteration || $sql$ |
| <a name="l00854"></a>00854 $sql$ |
| <a name="l00855"></a>00855 INTO theResult; |
| <a name="l00856"></a>00856 RETURN theResult; |
| <a name="l00857"></a>00857 END; |
| <a name="l00858"></a>00858 $$ LANGUAGE plpgsql VOLATILE; |
| <a name="l00859"></a>00859 |
| <a name="l00860"></a>00860 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>( |
| <a name="l00861"></a>00861 rel_source VARCHAR, |
| <a name="l00862"></a>00862 expr_point VARCHAR, |
| <a name="l00863"></a>00863 k INTEGER |
| <a name="l00864"></a>00864 ) RETURNS DOUBLE PRECISION[][] |
| <a name="l00865"></a>00865 LANGUAGE sql AS $$ |
| <a name="l00866"></a>00866 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>($1, $2, $3, NULL) |
| <a name="l00867"></a>00867 $$; |
| <a name="l00868"></a>00868 <span class="comment"></span> |
| <a name="l00869"></a>00869 <span class="comment">/**</span> |
| <a name="l00870"></a>00870 <span class="comment"> * @brief Run k-Means with random seeding.</span> |
| <a name="l00871"></a>00871 <span class="comment"> *</span> |
| <a name="l00872"></a>00872 <span class="comment"> * This is a shortcut for running k-means with random seeding. It is equivalent</span> |
| <a name="l00873"></a>00873 <span class="comment"> * to</span> |
| <a name="l00874"></a>00874 <span class="comment"> * <pre>SELECT \ref kmeans(</span> |
| <a name="l00875"></a>00875 <span class="comment"> rel_source,</span> |
| <a name="l00876"></a>00876 <span class="comment"> expr_point,</span> |
| <a name="l00877"></a>00877 <span class="comment"> \ref kmeans_random_seeding(</span> |
| <a name="l00878"></a>00878 <span class="comment"> rel_source,</span> |
| <a name="l00879"></a>00879 <span class="comment"> expr_point,</span> |
| <a name="l00880"></a>00880 <span class="comment"> k</span> |
| <a name="l00881"></a>00881 <span class="comment"> ),</span> |
| <a name="l00882"></a>00882 <span class="comment"> fn_dist,</span> |
| <a name="l00883"></a>00883 <span class="comment"> agg_centroid,</span> |
| <a name="l00884"></a>00884 <span class="comment"> max_num_iterations,</span> |
| <a name="l00885"></a>00885 <span class="comment"> min_frac_reassigned</span> |
| <a name="l00886"></a>00886 <span class="comment">)</pre></span> |
| <a name="l00887"></a>00887 <span class="comment"> */</span> |
| <a name="l00888"></a>00888 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>( |
| <a name="l00889"></a>00889 rel_source VARCHAR, |
| <a name="l00890"></a>00890 expr_point VARCHAR, |
| <a name="l00891"></a>00891 k INTEGER, |
| <a name="l00892"></a>00892 fn_dist VARCHAR <span class="comment">/*+ DEFAULT 'squared_dist_norm2' */</span>, |
| <a name="l00893"></a>00893 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT 'avg' */</span>, |
| <a name="l00894"></a>00894 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>, |
| <a name="l00895"></a>00895 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span> |
| <a name="l00896"></a>00896 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00897"></a>00897 VOLATILE |
| <a name="l00898"></a>00898 STRICT |
| <a name="l00899"></a>00899 LANGUAGE plpgsql |
| <a name="l00900"></a>00900 AS $$ |
| <a name="l00901"></a>00901 DECLARE |
| <a name="l00902"></a>00902 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00903"></a>00903 BEGIN |
| <a name="l00904"></a>00904 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00905"></a>00905 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| <a name="l00906"></a>00906 $4, $5, $6, $7); |
| <a name="l00907"></a>00907 RETURN ret; |
| <a name="l00908"></a>00908 END |
| <a name="l00909"></a>00909 $$; |
| <a name="l00910"></a><a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed">00910</a> |
| <a name="l00911"></a>00911 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>( |
| <a name="l00912"></a>00912 rel_source VARCHAR, |
| <a name="l00913"></a>00913 expr_point VARCHAR, |
| <a name="l00914"></a>00914 k INTEGER, |
| <a name="l00915"></a>00915 fn_dist VARCHAR, |
| <a name="l00916"></a>00916 agg_centroid VARCHAR, |
| <a name="l00917"></a>00917 max_num_iterations INTEGER |
| <a name="l00918"></a>00918 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00919"></a>00919 VOLATILE |
| <a name="l00920"></a>00920 STRICT |
| <a name="l00921"></a>00921 LANGUAGE plpgsql |
| <a name="l00922"></a>00922 AS $$ |
| <a name="l00923"></a>00923 DECLARE |
| <a name="l00924"></a>00924 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00925"></a>00925 BEGIN |
| <a name="l00926"></a>00926 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00927"></a>00927 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| <a name="l00928"></a>00928 $4, $5, $6, 0.001); |
| <a name="l00929"></a>00929 RETURN ret; |
| <a name="l00930"></a>00930 END |
| <a name="l00931"></a>00931 $$; |
| <a name="l00932"></a>00932 |
| <a name="l00933"></a>00933 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>( |
| <a name="l00934"></a>00934 rel_source VARCHAR, |
| <a name="l00935"></a>00935 expr_point VARCHAR, |
| <a name="l00936"></a>00936 k INTEGER, |
| <a name="l00937"></a>00937 fn_dist VARCHAR, |
| <a name="l00938"></a>00938 agg_centroid VARCHAR |
| <a name="l00939"></a>00939 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00940"></a>00940 VOLATILE |
| <a name="l00941"></a>00941 STRICT |
| <a name="l00942"></a>00942 LANGUAGE plpgsql |
| <a name="l00943"></a>00943 AS $$ |
| <a name="l00944"></a>00944 DECLARE |
| <a name="l00945"></a>00945 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00946"></a>00946 BEGIN |
| <a name="l00947"></a>00947 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00948"></a>00948 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| <a name="l00949"></a>00949 $4, $5, 20, 0.001); |
| <a name="l00950"></a>00950 RETURN ret; |
| <a name="l00951"></a>00951 END |
| <a name="l00952"></a>00952 $$; |
| <a name="l00953"></a>00953 |
| <a name="l00954"></a>00954 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>( |
| <a name="l00955"></a>00955 rel_source VARCHAR, |
| <a name="l00956"></a>00956 expr_point VARCHAR, |
| <a name="l00957"></a>00957 k INTEGER, |
| <a name="l00958"></a>00958 fn_dist VARCHAR |
| <a name="l00959"></a>00959 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00960"></a>00960 VOLATILE |
| <a name="l00961"></a>00961 STRICT |
| <a name="l00962"></a>00962 LANGUAGE plpgsql |
| <a name="l00963"></a>00963 AS $$ |
| <a name="l00964"></a>00964 DECLARE |
| <a name="l00965"></a>00965 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00966"></a>00966 BEGIN |
| <a name="l00967"></a>00967 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00968"></a>00968 $1, $2, |
| <a name="l00969"></a>00969 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| <a name="l00970"></a>00970 $4, 'MADLIB_SCHEMA.avg', 20, 0.001); |
| <a name="l00971"></a>00971 RETURN ret; |
| <a name="l00972"></a>00972 END |
| <a name="l00973"></a>00973 $$; |
| <a name="l00974"></a>00974 |
| <a name="l00975"></a>00975 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>( |
| <a name="l00976"></a>00976 rel_source VARCHAR, |
| <a name="l00977"></a>00977 expr_point VARCHAR, |
| <a name="l00978"></a>00978 k INTEGER |
| <a name="l00979"></a>00979 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l00980"></a>00980 VOLATILE |
| <a name="l00981"></a>00981 STRICT |
| <a name="l00982"></a>00982 LANGUAGE plpgsql |
| <a name="l00983"></a>00983 AS $$ |
| <a name="l00984"></a>00984 DECLARE |
| <a name="l00985"></a>00985 ret MADLIB_SCHEMA.kmeans_result; |
| <a name="l00986"></a>00986 BEGIN |
| <a name="l00987"></a>00987 ret = MADLIB_SCHEMA.kmeans( |
| <a name="l00988"></a>00988 $1, $2, |
| <a name="l00989"></a>00989 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| <a name="l00990"></a>00990 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001); |
| <a name="l00991"></a>00991 RETURN ret; |
| <a name="l00992"></a>00992 END |
| <a name="l00993"></a>00993 $$; |
| <a name="l00994"></a>00994 <span class="comment"></span> |
| <a name="l00995"></a>00995 <span class="comment">/**</span> |
| <a name="l00996"></a>00996 <span class="comment"> * @internal</span> |
| <a name="l00997"></a>00997 <span class="comment"> * @brief Execute a SQL command where $1, ..., $6 are substituted with the</span> |
| <a name="l00998"></a>00998 <span class="comment"> * given arguments.</span> |
| <a name="l00999"></a>00999 <span class="comment"> */</span> |
| <a name="l01000"></a>01000 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args( |
| <a name="l01001"></a>01001 sql VARCHAR, rel_source VARCHAR, expr_point VARCHAR, |
| <a name="l01002"></a>01002 fn_dist VARCHAR, agg_centroid VARCHAR, max_num_iterations INTEGER, |
| <a name="l01003"></a>01003 min_frac_reassigned DOUBLE PRECISION |
| <a name="l01004"></a>01004 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01005"></a>01005 VOLATILE |
| <a name="l01006"></a>01006 CALLED ON NULL INPUT |
| <a name="l01007"></a>01007 LANGUAGE c |
| <a name="l01008"></a>01008 AS 'MODULE_PATHNAME', 'exec_sql_using'; |
| <a name="l01009"></a>01009 <span class="comment"></span> |
| <a name="l01010"></a>01010 <span class="comment">/**</span> |
| <a name="l01011"></a>01011 <span class="comment"> * @internal</span> |
| <a name="l01012"></a>01012 <span class="comment"> * @brief Filter out the invalid data points in the original input relation</span> |
| <a name="l01013"></a>01013 <span class="comment"> */</span> |
| <a name="l01014"></a>01014 CREATE FUNCTION MADLIB_SCHEMA.__filter_input_relation( |
| <a name="l01015"></a>01015 rel_source VARCHAR, expr_point VARCHAR) |
| <a name="l01016"></a>01016 RETURNS VARCHAR |
| <a name="l01017"></a>01017 AS $$ |
| <a name="l01018"></a>01018 DECLARE |
| <a name="l01019"></a>01019 oldClientMinMessages VARCHAR; |
| <a name="l01020"></a>01020 rel_source_filtered VARCHAR; |
| <a name="l01021"></a>01021 BEGIN |
| <a name="l01022"></a>01022 IF (SELECT position('.' in rel_source)) > 0 THEN |
| <a name="l01023"></a>01023 rel_source_filtered := '_madlib_' || split_part(rel_source, '.', 2) || '_filtered'; |
| <a name="l01024"></a>01024 ELSE |
| <a name="l01025"></a>01025 rel_source_filtered := '_madlib_' || rel_source || '_filtered'; |
| <a name="l01026"></a>01026 END IF; |
| <a name="l01027"></a>01027 |
| <a name="l01028"></a>01028 oldClientMinMessages := |
| <a name="l01029"></a>01029 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| <a name="l01030"></a>01030 EXECUTE 'SET client_min_messages TO warning'; |
| <a name="l01031"></a>01031 EXECUTE 'DROP VIEW IF EXISTS _madlib_'||rel_source_filtered||'_filtered'; |
| <a name="l01032"></a>01032 EXECUTE 'DROP VIEW IF EXISTS '||rel_source_filtered; |
| <a name="l01033"></a>01033 EXECUTE 'CREATE TEMP VIEW '||rel_source_filtered||' |
| <a name="l01034"></a>01034 AS SELECT * FROM '||rel_source||' |
| <a name="l01035"></a>01035 WHERE abs( |
| <a name="l01036"></a>01036 coalesce( |
| <a name="l01037"></a>01037 MADLIB_SCHEMA.<a class="code" href="svec_8sql__in.html#a00a7b3260b9fde9b55061e6bf58a028a">svec_elsum</a>('||expr_point||'), |
| <a name="l01038"></a>01038 ''Infinity''::FLOAT8 |
| <a name="l01039"></a>01039 ) |
| <a name="l01040"></a>01040 ) < ''Infinity''::FLOAT8'; |
| <a name="l01041"></a>01041 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| <a name="l01042"></a>01042 RETURN rel_source_filtered; |
| <a name="l01043"></a>01043 EXCEPTION |
| <a name="l01044"></a>01044 WHEN undefined_function THEN |
| <a name="l01045"></a>01045 RAISE EXCEPTION 'Point coordinates (%) are not a valid type |
| <a name="l01046"></a>01046 (SVEC, FLOAT[], or INTEGER[]).', expr_point; |
| <a name="l01047"></a>01047 END |
| <a name="l01048"></a>01048 $$ |
| <a name="l01049"></a>01049 VOLATILE STRICT |
| <a name="l01050"></a>01050 LANGUAGE PLPGSQL; |
| <a name="l01051"></a>01051 <span class="comment"></span> |
| <a name="l01052"></a>01052 <span class="comment">/**</span> |
| <a name="l01053"></a>01053 <span class="comment"> * @brief Perform Lloyd's k-means local-search heuristic, but with initial</span> |
| <a name="l01054"></a>01054 <span class="comment"> * centroids stored in a table</span> |
| <a name="l01055"></a>01055 <span class="comment"> *</span> |
| <a name="l01056"></a>01056 <span class="comment"> * This is a shortcut for running k-means with initial centroids stored in a</span> |
| <a name="l01057"></a>01057 <span class="comment"> * table (as opposed to an array of centroids). It is equivalent</span> |
| <a name="l01058"></a>01058 <span class="comment"> * to</span> |
| <a name="l01059"></a>01059 <span class="comment"> * <pre>SELECT \ref kmeans(</span> |
| <a name="l01060"></a>01060 <span class="comment"> rel_source,</span> |
| <a name="l01061"></a>01061 <span class="comment"> expr_point,</span> |
| <a name="l01062"></a>01062 <span class="comment"> (SELECT \ref matrix_agg($expr_centroid) FROM $rel_initial_centroids),</span> |
| <a name="l01063"></a>01063 <span class="comment"> fn_dist,</span> |
| <a name="l01064"></a>01064 <span class="comment"> agg_centroid,</span> |
| <a name="l01065"></a>01065 <span class="comment"> max_num_iterations,</span> |
| <a name="l01066"></a>01066 <span class="comment"> min_frac_reassigned</span> |
| <a name="l01067"></a>01067 <span class="comment">)</pre></span> |
| <a name="l01068"></a>01068 <span class="comment"> * where <tt>$expr_centroid</tt> and <tt>$rel_initial_centroids</tt> denote</span> |
| <a name="l01069"></a>01069 <span class="comment"> * textual substituions.</span> |
| <a name="l01070"></a>01070 <span class="comment"> */</span> |
| <a name="l01071"></a>01071 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l01072"></a>01072 rel_source VARCHAR, |
| <a name="l01073"></a>01073 expr_point VARCHAR, |
| <a name="l01074"></a>01074 rel_initial_centroids VARCHAR, |
| <a name="l01075"></a>01075 expr_centroid VARCHAR, |
| <a name="l01076"></a>01076 fn_dist VARCHAR <span class="comment">/*+ DEFAULT 'squared_dist_norm2' */</span>, |
| <a name="l01077"></a>01077 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT 'avg' */</span>, |
| <a name="l01078"></a>01078 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>, |
| <a name="l01079"></a>01079 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span> |
| <a name="l01080"></a>01080 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01081"></a>01081 VOLATILE |
| <a name="l01082"></a>01082 STRICT |
| <a name="l01083"></a>01083 LANGUAGE plpgsql |
| <a name="l01084"></a>01084 AS $$ |
| <a name="l01085"></a>01085 DECLARE |
| <a name="l01086"></a>01086 class_rel_initial_centroids REGCLASS; |
| <a name="l01087"></a>01087 theResult MADLIB_SCHEMA.kmeans_result; |
| <a name="l01088"></a>01088 BEGIN |
| <a name="l01089"></a>01089 class_rel_initial_centroids := rel_initial_centroids; |
| <a name="l01090"></a>01090 SELECT * FROM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$ |
| <a name="l01091"></a>01091 SELECT MADLIB_SCHEMA.kmeans( |
| <a name="l01092"></a>01092 $1, $2, |
| <a name="l01093"></a><a class="code" href="kmeans_8sql__in.html#a6e1a47f006bc0576f56eabcd6903086f">01093</a> ( |
| <a name="l01094"></a>01094 SELECT MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a9c439706f35d6cac89f151d553a5f111" title="Combine vectors to a matrix.">matrix_agg</a>(($sql$ || expr_centroid || $sql$)::FLOAT8[]) |
| <a name="l01095"></a>01095 FROM $sql$ || textin(regclassout(class_rel_initial_centroids)) |
| <a name="l01096"></a>01096 || $sql$ |
| <a name="l01097"></a>01097 ), |
| <a name="l01098"></a>01098 $3, $4, $5, $6) |
| <a name="l01099"></a>01099 $sql$, |
| <a name="l01100"></a>01100 rel_source, expr_point, |
| <a name="l01101"></a>01101 fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned) |
| <a name="l01102"></a>01102 INTO theResult; |
| <a name="l01103"></a>01103 RETURN theResult; |
| <a name="l01104"></a>01104 END; |
| <a name="l01105"></a>01105 $$; |
| <a name="l01106"></a>01106 |
| <a name="l01107"></a>01107 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l01108"></a>01108 rel_source VARCHAR, |
| <a name="l01109"></a>01109 expr_point VARCHAR, |
| <a name="l01110"></a>01110 rel_initial_centroids VARCHAR, |
| <a name="l01111"></a>01111 expr_centroid VARCHAR, |
| <a name="l01112"></a>01112 fn_dist VARCHAR, |
| <a name="l01113"></a>01113 agg_centroid VARCHAR, |
| <a name="l01114"></a>01114 max_num_iterations INTEGER |
| <a name="l01115"></a>01115 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01116"></a>01116 VOLATILE |
| <a name="l01117"></a>01117 STRICT |
| <a name="l01118"></a>01118 LANGUAGE sql AS $$ |
| <a name="l01119"></a>01119 SELECT MADLIB_SCHEMA.kmeans( |
| <a name="l01120"></a>01120 $1, $2, |
| <a name="l01121"></a>01121 $3, $4, $5, $6, $7, 0.001) |
| <a name="l01122"></a>01122 $$; |
| <a name="l01123"></a>01123 |
| <a name="l01124"></a>01124 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l01125"></a>01125 rel_source VARCHAR, |
| <a name="l01126"></a>01126 expr_point VARCHAR, |
| <a name="l01127"></a>01127 rel_initial_centroids VARCHAR, |
| <a name="l01128"></a>01128 expr_centroid VARCHAR, |
| <a name="l01129"></a>01129 fn_dist VARCHAR, |
| <a name="l01130"></a>01130 agg_centroid VARCHAR |
| <a name="l01131"></a>01131 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01132"></a>01132 VOLATILE |
| <a name="l01133"></a>01133 STRICT |
| <a name="l01134"></a>01134 LANGUAGE sql AS $$ |
| <a name="l01135"></a>01135 SELECT MADLIB_SCHEMA.kmeans( |
| <a name="l01136"></a>01136 $1, $2, |
| <a name="l01137"></a>01137 $3, $4, $5, $6, 20, 0.001) |
| <a name="l01138"></a>01138 $$; |
| <a name="l01139"></a>01139 |
| <a name="l01140"></a>01140 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l01141"></a>01141 rel_source VARCHAR, |
| <a name="l01142"></a>01142 expr_point VARCHAR, |
| <a name="l01143"></a>01143 rel_initial_centroids VARCHAR, |
| <a name="l01144"></a>01144 expr_centroid VARCHAR, |
| <a name="l01145"></a>01145 fn_dist VARCHAR |
| <a name="l01146"></a>01146 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01147"></a>01147 VOLATILE |
| <a name="l01148"></a>01148 STRICT |
| <a name="l01149"></a>01149 LANGUAGE sql AS $$ |
| <a name="l01150"></a>01150 SELECT MADLIB_SCHEMA.kmeans( |
| <a name="l01151"></a>01151 $1, $2, |
| <a name="l01152"></a>01152 $3, $4, $5, 'MADLIB_SCHEMA.avg', 20, 0.001) |
| <a name="l01153"></a>01153 $$; |
| <a name="l01154"></a>01154 |
| <a name="l01155"></a>01155 CREATE FUNCTION MADLIB_SCHEMA.kmeans( |
| <a name="l01156"></a>01156 rel_source VARCHAR, |
| <a name="l01157"></a>01157 expr_point VARCHAR, |
| <a name="l01158"></a>01158 rel_initial_centroids VARCHAR, |
| <a name="l01159"></a>01159 expr_centroid VARCHAR |
| <a name="l01160"></a>01160 ) RETURNS MADLIB_SCHEMA.kmeans_result |
| <a name="l01161"></a>01161 VOLATILE |
| <a name="l01162"></a>01162 STRICT |
| <a name="l01163"></a>01163 LANGUAGE sql AS $$ |
| <a name="l01164"></a>01164 SELECT MADLIB_SCHEMA.kmeans( |
| <a name="l01165"></a>01165 $1, $2, |
| <a name="l01166"></a>01166 $3, $4, |
| <a name="l01167"></a>01167 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001) |
| <a name="l01168"></a>01168 $$; |
| <a name="l01169"></a>01169 |
| <a name="l01170"></a>01170 <span class="comment"></span> |
| <a name="l01171"></a>01171 <span class="comment">/**</span> |
| <a name="l01172"></a>01172 <span class="comment"> * @internal</span> |
| <a name="l01173"></a>01173 <span class="comment"> * @brief Execute a SQL command where $1, ..., $3 are substituted with the</span> |
| <a name="l01174"></a>01174 <span class="comment"> * given arguments.</span> |
| <a name="l01175"></a>01175 <span class="comment"> */</span> |
| <a name="l01176"></a>01176 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_silhouette_args( |
| <a name="l01177"></a>01177 sql VARCHAR, centroids DOUBLE PRECISION[][], fn_dist REGPROC |
| <a name="l01178"></a>01178 ) RETURNS DOUBLE PRECISION |
| <a name="l01179"></a>01179 STABLE |
| <a name="l01180"></a>01180 CALLED ON NULL INPUT |
| <a name="l01181"></a>01181 LANGUAGE c |
| <a name="l01182"></a>01182 AS 'MODULE_PATHNAME', 'exec_sql_using'; |
| <a name="l01183"></a>01183 <span class="comment"></span> |
| <a name="l01184"></a>01184 <span class="comment">/**</span> |
| <a name="l01185"></a>01185 <span class="comment"> * @brief Compute a simplified version of the silhouette coefficient</span> |
| <a name="l01186"></a>01186 <span class="comment"> *</span> |
| <a name="l01187"></a>01187 <span class="comment"> * @param rel_source Name of the relation containing input points</span> |
| <a name="l01188"></a>01188 <span class="comment"> * @param expr_point Expression evaluating to point coordinates \f$ x_i \f$ for</span> |
| <a name="l01189"></a>01189 <span class="comment"> * each tuple</span> |
| <a name="l01190"></a>01190 <span class="comment"> * @param centroids Matrix \f$ M = (\vec{m_0} \dots \vec{m_{k-1}})</span> |
| <a name="l01191"></a>01191 <span class="comment"> * \in \mathbb{R}^{d \times k} \f$ with \f$ k \f$ columns, where column</span> |
| <a name="l01192"></a>01192 <span class="comment"> * \f$ i \f$ contains the position of centroid \f$ i \f$.</span> |
| <a name="l01193"></a>01193 <span class="comment"> * @param fn_dist Name of a function with signature</span> |
| <a name="l01194"></a>01194 <span class="comment"> * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that</span> |
| <a name="l01195"></a>01195 <span class="comment"> * returns the distance between two points</span> |
| <a name="l01196"></a>01196 <span class="comment"> * @return For each point \f$ x_i \f$, let</span> |
| <a name="l01197"></a>01197 <span class="comment"> * \f$ d_1( x_i ) \f$ and \f$ d_2( x_i ) \f$ be the distance to the closest</span> |
| <a name="l01198"></a>01198 <span class="comment"> * and 2nd-closest centroid, respectively. If there is more than one</span> |
| <a name="l01199"></a>01199 <span class="comment"> * closest centroids then \f$ d_1( x_i ) = d_2( x_i )\f$.</span> |
| <a name="l01200"></a>01200 <span class="comment"> * The return value is the average, over all points \f$ x_i \f$, of</span> |
| <a name="l01201"></a>01201 <span class="comment"> * \f[</span> |
| <a name="l01202"></a>01202 <span class="comment"> * \frac{d_2( x_i ) - d_1(x_i)}{d_2(x_i)},</span> |
| <a name="l01203"></a>01203 <span class="comment"> * \f]</span> |
| <a name="l01204"></a>01204 <span class="comment"> * where 0/0 is interpreted as 0.</span> |
| <a name="l01205"></a>01205 <span class="comment"> * Clearly, the simplified silhouette coefficient assumes values in</span> |
| <a name="l01206"></a>01206 <span class="comment"> * \f$ [0,1] \f$.</span> |
| <a name="l01207"></a>01207 <span class="comment"> */</span> |
| <a name="l01208"></a>01208 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>( |
| <a name="l01209"></a>01209 rel_source VARCHAR, |
| <a name="l01210"></a>01210 expr_point VARCHAR, |
| <a name="l01211"></a>01211 centroids DOUBLE PRECISION[][], |
| <a name="l01212"></a>01212 fn_dist VARCHAR <span class="comment">/*+ DEFAULT 'dist_norm2' */</span> |
| <a name="l01213"></a>01213 ) RETURNS DOUBLE PRECISION |
| <a name="l01214"></a>01214 STABLE |
| <a name="l01215"></a>01215 STRICT |
| <a name="l01216"></a>01216 LANGUAGE plpgsql |
| <a name="l01217"></a>01217 AS $$ |
| <a name="l01218"></a>01218 DECLARE |
| <a name="l01219"></a>01219 class_rel_source REGCLASS; |
| <a name="l01220"></a>01220 proc_fn_dist REGPROCEDURE; |
| <a name="l01221"></a>01221 rel_filtered VARCHAR; |
| <a name="l01222"></a>01222 BEGIN |
| <a name="l01223"></a>01223 IF (array_upper(centroids,1) IS NULL) THEN |
| <a name="l01224"></a>01224 RAISE EXCEPTION 'No valid centroids given.'; |
| <a name="l01225"></a>01225 END IF; |
| <a name="l01226"></a>01226 |
| <a name="l01227"></a>01227 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point); |
| <a name="l01228"></a>01228 class_rel_source := rel_filtered; |
| <a name="l01229"></a>01229 proc_fn_dist := fn_dist |
| <a name="l01230"></a><a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f">01230</a> || '(DOUBLE PRECISION[], DOUBLE PRECISION[])'; |
| <a name="l01231"></a>01231 IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE |
| <a name="l01232"></a>01232 FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| <a name="l01233"></a>01233 RAISE EXCEPTION 'Distance function has wrong signature or is not a simple function.'; |
| <a name="l01234"></a>01234 END IF; |
| <a name="l01235"></a>01235 |
| <a name="l01236"></a>01236 RETURN MADLIB_SCHEMA.internal_execute_using_silhouette_args($sql$ |
| <a name="l01237"></a>01237 SELECT |
| <a name="l01238"></a>01238 avg(CASE |
| <a name="l01239"></a>01239 WHEN distances[2] = 0 THEN 0 |
| <a name="l01240"></a>01240 ELSE (distances[2] - distances[1]) / distances[2] |
| <a name="l01241"></a>01241 END) |
| <a name="l01242"></a>01242 FROM ( |
| <a name="l01243"></a>01243 SELECT |
| <a name="l01244"></a>01244 (MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#ad3e16fc5435474b96b182ba20905461e" title="Given matrix and vector compute the columns of that are closest to .">closest_columns</a>( |
| <a name="l01245"></a>01245 $1, |
| <a name="l01246"></a>01246 ($sql$ || expr_point || $sql$)::FLOAT8[], |
| <a name="l01247"></a>01247 2::INTEGER, |
| <a name="l01248"></a>01248 $2 |
| <a name="l01249"></a>01249 )).distances |
| <a name="l01250"></a>01250 FROM |
| <a name="l01251"></a>01251 $sql$ || textin(regclassout(class_rel_source)) || $sql$ |
| <a name="l01252"></a>01252 ) AS two_shortest_distances |
| <a name="l01253"></a>01253 $sql$, |
| <a name="l01254"></a>01254 centroids, proc_fn_dist); |
| <a name="l01255"></a>01255 END; |
| <a name="l01256"></a>01256 $$; |
| <a name="l01257"></a>01257 |
| <a name="l01258"></a>01258 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>( |
| <a name="l01259"></a>01259 rel_source VARCHAR, |
| <a name="l01260"></a>01260 expr_point VARCHAR, |
| <a name="l01261"></a>01261 centroids DOUBLE PRECISION[][] |
| <a name="l01262"></a>01262 ) RETURNS DOUBLE PRECISION |
| <a name="l01263"></a>01263 STABLE |
| <a name="l01264"></a>01264 STRICT |
| <a name="l01265"></a>01265 LANGUAGE sql |
| <a name="l01266"></a>01266 AS $$ |
| <a name="l01267"></a>01267 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>($1, $2, $3, |
| <a name="l01268"></a>01268 'MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#aa58e51526edea6ea98db30b6f250adb4" title="2-norm of the difference between two vectors">dist_norm2</a>') |
| <a name="l01269"></a>01269 $$; |
| </pre></div></div> |
| </div> |
| <div id="nav-path" class="navpath"> |
| <ul> |
| <li class="navelem"><a class="el" href="kmeans_8sql__in.html">kmeans.sql_in</a> </li> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark"> </span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark"> </span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark"> </span>Functions</a></div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| |
| <li class="footer">Generated on Fri May 10 2013 01:37:13 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li> |
| </ul> |
| </div> |
| |
| |
| </body> |
| </html> |