blob: 9dcbbb32288aaa0a0f4c5b49f1021836f657805d [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<title>MADlib: kmeans.sql_in Source File</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script src="../mathjax/MathJax.js">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script>
</head>
<body>
<div id="top"><!-- do not remove this div! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">0.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./kmeans_8sql__in_source.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- Generated by Doxygen 1.7.5.1 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
<script type="text/javascript" src="dynsections.js"></script>
<div id="navrow1" class="tabs">
<ul class="tablist">
<li><a href="index.html"><span>Main&#160;Page</span></a></li>
<li><a href="modules.html"><span>Modules</span></a></li>
<li class="current"><a href="files.html"><span>Files</span></a></li>
<li>
<div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</li>
</ul>
</div>
<div id="navrow2" class="tabs2">
<ul class="tablist">
<li><a href="files.html"><span>File&#160;List</span></a></li>
<li><a href="globals.html"><span>File&#160;Members</span></a></li>
</ul>
</div>
</div>
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
initNavTree('kmeans_8sql__in.html','');
</script>
<div id="doc-content">
<div class="header">
<div class="headertitle">
<div class="title">kmeans.sql_in</div> </div>
</div>
<div class="contents">
<a href="kmeans_8sql__in.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* ----------------------------------------------------------------------- */</span><span class="comment">/**</span>
<a name="l00002"></a>00002 <span class="comment"> *</span>
<a name="l00003"></a>00003 <span class="comment"> * @file kmeans.sql_in</span>
<a name="l00004"></a>00004 <span class="comment"> *</span>
<a name="l00005"></a>00005 <span class="comment"> * @brief Set of functions for k-means clustering.</span>
<a name="l00006"></a>00006 <span class="comment"> *</span>
<a name="l00007"></a>00007 <span class="comment"> * @sa For a brief introduction to k-means clustering, see the module</span>
<a name="l00008"></a>00008 <span class="comment"> * description \ref grp_kmeans.</span>
<a name="l00009"></a>00009 <span class="comment"> *</span>
<a name="l00010"></a>00010 <span class="comment"> */</span><span class="comment">/* ----------------------------------------------------------------------- */</span>
<a name="l00011"></a>00011
<a name="l00012"></a>00012 m4_include(`SQLCommon.m4<span class="stringliteral">&#39;)</span>
<a name="l00013"></a>00013 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00014"></a>00014 <span class="comment">/**</span>
<a name="l00015"></a>00015 <span class="comment">@addtogroup grp_kmeans</span>
<a name="l00016"></a>00016 <span class="comment"></span>
<a name="l00017"></a>00017 <span class="comment">@about</span>
<a name="l00018"></a>00018 <span class="comment"></span>
<a name="l00019"></a>00019 <span class="comment">Clustering refers to the problem of partitioning a set of objects according to</span>
<a name="l00020"></a>00020 <span class="comment">some problem-dependent measure of &lt;em&gt;similarity&lt;/em&gt;. In the k-means variant,</span>
<a name="l00021"></a>00021 <span class="comment">one is given \f$ n \f$ points \f$ x_1, \dots, x_n \in \mathbb R^d \f$, and the</span>
<a name="l00022"></a>00022 <span class="comment">goal is to position \f$ k \f$ centroids \f$ c_1, \dots, c_k \in \mathbb R^d \f$</span>
<a name="l00023"></a>00023 <span class="comment">so that the sum of \em distances between each point and its closest centroid is</span>
<a name="l00024"></a>00024 <span class="comment">minimized. Each centroid represents a cluster that consists of all points to</span>
<a name="l00025"></a>00025 <span class="comment">which this centroid is closest. Formally, we wish to minimize the following</span>
<a name="l00026"></a>00026 <span class="comment">objective function:</span>
<a name="l00027"></a>00027 <span class="comment">\f[</span>
<a name="l00028"></a>00028 <span class="comment"> (c_1, \dots, c_k) \mapsto \sum_{i=1}^n \min_{j=1}^k \operatorname{dist}(x_i, c_j)</span>
<a name="l00029"></a>00029 <span class="comment">\f]</span>
<a name="l00030"></a>00030 <span class="comment">In the most common case, \f$ \operatorname{dist} \f$ is the square of the</span>
<a name="l00031"></a>00031 <span class="comment">Euclidean distance.</span>
<a name="l00032"></a>00032 <span class="comment"></span>
<a name="l00033"></a>00033 <span class="comment">This problem is computationally difficult (NP-hard), yet the</span>
<a name="l00034"></a>00034 <span class="comment">local-search heuristic proposed by Lloyd [4] performs reasonably well in</span>
<a name="l00035"></a>00035 <span class="comment">practice. In fact, it is so ubiquitous today that it is</span>
<a name="l00036"></a>00036 <span class="comment">often referred to as the &lt;em&gt;standard algorithm&lt;/em&gt; or even just the</span>
<a name="l00037"></a>00037 <span class="comment">&lt;em&gt;k-means algorithm&lt;/em&gt; [1]. It works as follows:</span>
<a name="l00038"></a>00038 <span class="comment"></span>
<a name="l00039"></a>00039 <span class="comment">-# Seed the \f$ k \f$ centroids (see below)</span>
<a name="l00040"></a>00040 <span class="comment">-# Repeat until convergence:</span>
<a name="l00041"></a>00041 <span class="comment"> -# Assign each point to its closest centroid</span>
<a name="l00042"></a>00042 <span class="comment"> -# Move each centroid to a position that minimizes the sum of distances in this</span>
<a name="l00043"></a>00043 <span class="comment"> cluster</span>
<a name="l00044"></a>00044 <span class="comment">-# Convergence is achieved when no points change their assignments during step</span>
<a name="l00045"></a>00045 <span class="comment"> 2a.</span>
<a name="l00046"></a>00046 <span class="comment"></span>
<a name="l00047"></a>00047 <span class="comment">Since the objective function decreases in every step, this algorithm is</span>
<a name="l00048"></a>00048 <span class="comment">guaranteed to converge to a local optimum.</span>
<a name="l00049"></a>00049 <span class="comment"></span>
<a name="l00050"></a>00050 <span class="comment">@implementation</span>
<a name="l00051"></a>00051 <span class="comment"></span>
<a name="l00052"></a>00052 <span class="comment">Data points and predefined centroids (if used) are expected to be stored row-wise,</span>
<a name="l00053"></a>00053 <span class="comment">in a column of type &lt;tt&gt;\ref grp_svec &quot;SVEC&quot;&lt;/tt&gt; (or any type convertible to</span>
<a name="l00054"></a>00054 <span class="comment">&lt;tt&gt;\ref grp_svec &quot;SVEC&quot;&lt;/tt&gt;, like &lt;tt&gt;FLOAT[]&lt;/tt&gt; or &lt;tt&gt;INTEGER[]&lt;/tt&gt;).</span>
<a name="l00055"></a>00055 <span class="comment">Data points with non-finite values (NULL, NaN, infinity) in any component</span>
<a name="l00056"></a>00056 <span class="comment">will be skipped during analysis.</span>
<a name="l00057"></a>00057 <span class="comment"></span>
<a name="l00058"></a>00058 <span class="comment">The following methods are available for the centroid seeding:</span>
<a name="l00059"></a>00059 <span class="comment"> - &lt;strong&gt;random selection&lt;/strong&gt;:</span>
<a name="l00060"></a>00060 <span class="comment"> Select \f$ k \f$ centroids randomly among the input points.</span>
<a name="l00061"></a>00061 <span class="comment"> - &lt;strong&gt;kmeans++&lt;/strong&gt; [2]:</span>
<a name="l00062"></a>00062 <span class="comment"> Start with a single centroid chosen randomly among the input points. Then</span>
<a name="l00063"></a>00063 <span class="comment"> iteratively choose new</span>
<a name="l00064"></a>00064 <span class="comment"> centroids from the input points until there is a total of \f$ k \f$</span>
<a name="l00065"></a>00065 <span class="comment"> centroids. The probability for picking a particular point is proportional to</span>
<a name="l00066"></a>00066 <span class="comment"> its minimum distance to any existing centroid.</span>
<a name="l00067"></a>00067 <span class="comment"> \n</span>
<a name="l00068"></a>00068 <span class="comment"> Intuitively, kmeans++ favors seedings where centroids are spread out over the</span>
<a name="l00069"></a>00069 <span class="comment"> whole range of the input points, while at the same time not being too</span>
<a name="l00070"></a>00070 <span class="comment"> susceptible to outliers [2].</span>
<a name="l00071"></a>00071 <span class="comment"> - &lt;strong&gt;user-specified set of initial centroids&lt;/strong&gt;:</span>
<a name="l00072"></a>00072 <span class="comment"> See below for a description of the expected format of the set of initial</span>
<a name="l00073"></a>00073 <span class="comment"> centroids.</span>
<a name="l00074"></a>00074 <span class="comment"></span>
<a name="l00075"></a>00075 <span class="comment">The following distance functions can be used (computation of barycenter/mean in parentheses):</span>
<a name="l00076"></a>00076 <span class="comment"> - &lt;strong&gt;\ref dist_norm1&lt;/strong&gt;: 1-norm/Manhattan (element-wise median [Note that MADlib does not provide a median</span>
<a name="l00077"></a>00077 <span class="comment"> aggregate function for support and performance reasons.])</span>
<a name="l00078"></a>00078 <span class="comment"> - &lt;strong&gt;\ref dist_norm2&lt;/strong&gt;: 2-norm/Euclidean (element-wise mean)</span>
<a name="l00079"></a>00079 <span class="comment"> - &lt;strong&gt;\ref squared_dist_norm2&lt;/strong&gt;: squared Euclidean distance (element-wise mean)</span>
<a name="l00080"></a>00080 <span class="comment"> - &lt;strong&gt;\ref dist_angle&lt;/strong&gt;: angle (element-wise mean of normalized points)</span>
<a name="l00081"></a>00081 <span class="comment"> - &lt;strong&gt;\ref dist_tanimoto&lt;/strong&gt;: tanimoto (element-wise mean of normalized points [5])</span>
<a name="l00082"></a>00082 <span class="comment"> - &lt;strong&gt;user defined function&lt;/strong&gt; with signature DOUBLE PRECISION[] x DOUBLE PRECISION[] -&gt; DOUBLE PRECISION</span>
<a name="l00083"></a>00083 <span class="comment"></span>
<a name="l00084"></a>00084 <span class="comment">The following aggregate functions for determining centroids can be used:</span>
<a name="l00085"></a>00085 <span class="comment"> - &lt;strong&gt;\ref avg&lt;/strong&gt;: average</span>
<a name="l00086"></a>00086 <span class="comment"> - &lt;strong&gt;\ref normalized_avg&lt;/strong&gt;: normalized average</span>
<a name="l00087"></a>00087 <span class="comment"></span>
<a name="l00088"></a>00088 <span class="comment">The algorithm stops when one of the following conditions is met:</span>
<a name="l00089"></a>00089 <span class="comment"> - The fraction of updated points is smaller than the convergence threshold (default: 0.001).</span>
<a name="l00090"></a>00090 <span class="comment"> - The algorithm reaches the maximum number of allowed iterations (default: 20).</span>
<a name="l00091"></a>00091 <span class="comment"></span>
<a name="l00092"></a>00092 <span class="comment">A popular method to assess the quality of the clustering is the</span>
<a name="l00093"></a>00093 <span class="comment">&lt;em&gt;silhouette coefficient&lt;/em&gt;, a simplified version of which is provided as part of the k-means module.</span>
<a name="l00094"></a>00094 <span class="comment">Note that for large data sets, this computation is expensive.</span>
<a name="l00095"></a>00095 <span class="comment"></span>
<a name="l00096"></a>00096 <span class="comment">@input</span>
<a name="l00097"></a>00097 <span class="comment">The &lt;strong&gt;source relation&lt;/strong&gt; is expected to be of the following form (or</span>
<a name="l00098"></a>00098 <span class="comment">to be implicitly convertible into the following form):</span>
<a name="l00099"></a>00099 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;rel_source&lt;/em&gt; (</span>
<a name="l00100"></a>00100 <span class="comment"> ...</span>
<a name="l00101"></a>00101 <span class="comment"> &lt;em&gt;expr_points&lt;/em&gt; FLOAT8[],</span>
<a name="l00102"></a>00102 <span class="comment"> ...</span>
<a name="l00103"></a>00103 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00104"></a>00104 <span class="comment">where:</span>
<a name="l00105"></a>00105 <span class="comment"> - &lt;em&gt;expr_points&lt;/em&gt; is the name of a column with point coordinates.</span>
<a name="l00106"></a>00106 <span class="comment"> Types such as \c svec or &lt;tt&gt;INTEGER[]&lt;/tt&gt; are implicitly converted to</span>
<a name="l00107"></a>00107 <span class="comment"> &lt;tt&gt;FLOAT8[]&lt;/tt&gt;.</span>
<a name="l00108"></a>00108 <span class="comment"></span>
<a name="l00109"></a>00109 <span class="comment">If kmeans is called with a set of initial centroids, the centroid relation is</span>
<a name="l00110"></a>00110 <span class="comment">expected to be of the following form:</span>
<a name="l00111"></a>00111 <span class="comment">&lt;pre&gt;{TABLE|VIEW} &lt;em&gt;rel_initial_centroids&lt;/em&gt; (</span>
<a name="l00112"></a>00112 <span class="comment"> ...</span>
<a name="l00113"></a>00113 <span class="comment"> &lt;em&gt;expr_centroid&lt;/em&gt; DOUBLE PRECISION[],</span>
<a name="l00114"></a>00114 <span class="comment"> ...</span>
<a name="l00115"></a>00115 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00116"></a>00116 <span class="comment">where:</span>
<a name="l00117"></a>00117 <span class="comment"> - &lt;em&gt;expr_centroid&lt;/em&gt; is the name of a column with coordinates.</span>
<a name="l00118"></a>00118 <span class="comment"></span>
<a name="l00119"></a>00119 <span class="comment">@usage</span>
<a name="l00120"></a>00120 <span class="comment">The k-means algorithm can be invoked in four possible ways:</span>
<a name="l00121"></a>00121 <span class="comment"></span>
<a name="l00122"></a>00122 <span class="comment">- using &lt;em&gt;random&lt;/em&gt; centroid seeding method for a</span>
<a name="l00123"></a>00123 <span class="comment">provided \f$ k \f$:</span>
<a name="l00124"></a>00124 <span class="comment">&lt;pre&gt;SELECT * FROM \ref kmeans_random(</span>
<a name="l00125"></a>00125 <span class="comment"> &#39;&lt;em&gt;rel_source&lt;/em&gt;&#39;, &#39;&lt;em&gt;expr_point&lt;/em&gt;&#39;, k,</span>
<a name="l00126"></a>00126 <span class="comment"> [ &#39;&lt;em&gt;fn_dist&lt;/em&gt;&#39;, &#39;&lt;em&gt;agg_centroid&lt;/em&gt;&#39;,</span>
<a name="l00127"></a>00127 <span class="comment"> &lt;em&gt;max_num_iterations&lt;/em&gt;, &lt;em&gt;min_frac_reassigned&lt;/em&gt; ]</span>
<a name="l00128"></a>00128 <span class="comment">);&lt;/pre&gt;</span>
<a name="l00129"></a>00129 <span class="comment"></span>
<a name="l00130"></a>00130 <span class="comment">- using &lt;em&gt;kmeans++&lt;/em&gt; centroid seeding method for a</span>
<a name="l00131"></a>00131 <span class="comment">provided \f$ k \f$:</span>
<a name="l00132"></a>00132 <span class="comment">&lt;pre&gt;SELECT * FROM \ref kmeanspp(</span>
<a name="l00133"></a>00133 <span class="comment"> &#39;&lt;em&gt;rel_source&lt;/em&gt;&#39;, &#39;&lt;em&gt;expr_point&lt;/em&gt;&#39;, k,</span>
<a name="l00134"></a>00134 <span class="comment"> [ &#39;&lt;em&gt;fn_dist&lt;/em&gt;&#39;, &#39;&lt;em&gt;agg_centroid&lt;/em&gt;&#39;,</span>
<a name="l00135"></a>00135 <span class="comment"> &lt;em&gt;max_num_iterations&lt;/em&gt;, &lt;em&gt;min_frac_reassigned&lt;/em&gt; ]</span>
<a name="l00136"></a>00136 <span class="comment">);&lt;/pre&gt;</span>
<a name="l00137"></a>00137 <span class="comment"></span>
<a name="l00138"></a>00138 <span class="comment">- with a provided centroid set:</span>
<a name="l00139"></a>00139 <span class="comment">&lt;pre&gt;SELECT * FROM \ref kmeans(</span>
<a name="l00140"></a>00140 <span class="comment"> &#39;&lt;em&gt;rel_source&lt;/em&gt;&#39;, &#39;&lt;em&gt;expr_point&lt;/em&gt;&#39;,</span>
<a name="l00141"></a>00141 <span class="comment"> &#39;&lt;em&gt;rel_initial_centroids&lt;/em&gt;&#39;, &#39;&lt;em&gt;expr_centroid&lt;/em&gt;&#39;,</span>
<a name="l00142"></a>00142 <span class="comment"> [ &#39;&lt;em&gt;fn_dist&lt;/em&gt;&#39;, &#39;&lt;em&gt;agg_centroid&lt;/em&gt;&#39;,</span>
<a name="l00143"></a>00143 <span class="comment"> &lt;em&gt;max_num_iterations&lt;/em&gt;, &lt;em&gt;min_frac_reassigned&lt;/em&gt; ]</span>
<a name="l00144"></a>00144 <span class="comment">);&lt;/pre&gt;</span>
<a name="l00145"></a>00145 <span class="comment">------------ OR ---------------</span>
<a name="l00146"></a>00146 <span class="comment">&lt;pre&gt;SELECT * FROM \ref kmeans(</span>
<a name="l00147"></a>00147 <span class="comment"> &#39;&lt;em&gt;rel_source&lt;/em&gt;&#39;, &#39;&lt;em&gt;expr_point&lt;/em&gt;&#39;,</span>
<a name="l00148"></a>00148 <span class="comment"> initial_centroids,</span>
<a name="l00149"></a>00149 <span class="comment"> [ &#39;&lt;em&gt;fn_dist&lt;/em&gt;&#39;, &#39;&lt;em&gt;agg_centroid&lt;/em&gt;&#39;,</span>
<a name="l00150"></a>00150 <span class="comment"> &lt;em&gt;max_num_iterations&lt;/em&gt;, &lt;em&gt;min_frac_reassigned&lt;/em&gt; ]</span>
<a name="l00151"></a>00151 <span class="comment">);&lt;/pre&gt;</span>
<a name="l00152"></a>00152 <span class="comment">where:</span>
<a name="l00153"></a>00153 <span class="comment"> - &lt;em&gt;initial_centroids&lt;/em&gt; is of type &lt;tt&gt;DOUBLE PRECISION[][]&lt;/tt&gt;.</span>
<a name="l00154"></a>00154 <span class="comment"></span>
<a name="l00155"></a>00155 <span class="comment">The output of the k-means module is a table that includes the final</span>
<a name="l00156"></a>00156 <span class="comment">centroid positions (DOUBLE PRECISION[][]), the objective function,</span>
<a name="l00157"></a>00157 <span class="comment">the fraction of reassigned points in the last iteration, and</span>
<a name="l00158"></a>00158 <span class="comment">the number of total iterations:</span>
<a name="l00159"></a>00159 <span class="comment">&lt;pre&gt;</span>
<a name="l00160"></a>00160 <span class="comment"> centroids | objective_fn | frac_reassigned | num_iterations</span>
<a name="l00161"></a>00161 <span class="comment">----------------------------------+------------------+-----------------+----------------</span>
<a name="l00162"></a>00162 <span class="comment"> ...</span>
<a name="l00163"></a>00163 <span class="comment">&lt;/pre&gt;</span>
<a name="l00164"></a>00164 <span class="comment"></span>
<a name="l00165"></a>00165 <span class="comment">@examp</span>
<a name="l00166"></a>00166 <span class="comment"></span>
<a name="l00167"></a>00167 <span class="comment">-# Prepare some input data.</span>
<a name="l00168"></a>00168 <span class="comment">\code</span>
<a name="l00169"></a>00169 <span class="comment">sql&gt; SELECT * FROM public.km_sample LIMIT 5;</span>
<a name="l00170"></a>00170 <span class="comment"> points</span>
<a name="l00171"></a>00171 <span class="comment">-------------------------------------------</span>
<a name="l00172"></a>00172 <span class="comment"> {1,1}:{15.8822241332382,105.945462542586}</span>
<a name="l00173"></a>00173 <span class="comment"> {1,1}:{34.5065216883086,72.3126099305227}</span>
<a name="l00174"></a>00174 <span class="comment"> {1,1}:{22.5074400822632,95.3209559689276}</span>
<a name="l00175"></a>00175 <span class="comment"> {1,1}:{70.2589857042767,68.7395178806037}</span>
<a name="l00176"></a>00176 <span class="comment"> {1,1}:{30.9844257542863,25.3213323024102}</span>
<a name="l00177"></a>00177 <span class="comment">(5 rows)</span>
<a name="l00178"></a>00178 <span class="comment">\endcode</span>
<a name="l00179"></a>00179 <span class="comment">Note: the example &lt;em&gt;points&lt;/em&gt; is type &lt;tt&gt;\ref grp_svec &quot;SVEC&quot;&lt;/tt&gt;.</span>
<a name="l00180"></a>00180 <span class="comment">-# Run k-means clustering using kmeans++ for centroid seeding:</span>
<a name="l00181"></a>00181 <span class="comment">\code</span>
<a name="l00182"></a>00182 <span class="comment">sql&gt; SELECT * FROM madlib.kmeanspp(&#39;km_sample&#39;, &#39;points&#39;, 2, &#39;madlib.squared_dist_norm2&#39;, &#39;madlib.avg&#39;, 20, 0.001);</span>
<a name="l00183"></a>00183 <span class="comment">);</span>
<a name="l00184"></a>00184 <span class="comment"> centroids | objective_fn | frac_reassigned | num_iterations</span>
<a name="l00185"></a>00185 <span class="comment">-------------------------------------------------------------------------+------------------+-----------------+----------------</span>
<a name="l00186"></a>00186 <span class="comment"> {{68.01668579784,48.9667382972952},{28.1452167573446,84.5992507653263}} | 586729.010675982 | 0.001 | 5</span>
<a name="l00187"></a>00187 <span class="comment">\endcode</span>
<a name="l00188"></a>00188 <span class="comment">-# Calculate the simplified silhouette coefficient:</span>
<a name="l00189"></a>00189 <span class="comment">\code</span>
<a name="l00190"></a>00190 <span class="comment">sql&gt; SELECT * from madlib.simple_silhouette(&#39;km_test_svec&#39;,&#39;points&#39;,</span>
<a name="l00191"></a>00191 <span class="comment"> (select centroids from madlib.kmeanspp(&#39;km_test_svec&#39;,&#39;points&#39;,2,&#39;madlib.squared_dist_norm2&#39;,&#39;madlib.avg&#39;,20,0.001)),</span>
<a name="l00192"></a>00192 <span class="comment"> &#39;madlib.dist_norm2&#39;);</span>
<a name="l00193"></a>00193 <span class="comment"> simple_silhouette</span>
<a name="l00194"></a>00194 <span class="comment">-------------------</span>
<a name="l00195"></a>00195 <span class="comment"> 0.611022970398174</span>
<a name="l00196"></a>00196 <span class="comment">\endcode</span>
<a name="l00197"></a>00197 <span class="comment"></span>
<a name="l00198"></a>00198 <span class="comment">@literature</span>
<a name="l00199"></a>00199 <span class="comment"></span>
<a name="l00200"></a>00200 <span class="comment">[1] Wikipedia, K-means Clustering,</span>
<a name="l00201"></a>00201 <span class="comment"> http://en.wikipedia.org/wiki/K-means_clustering</span>
<a name="l00202"></a>00202 <span class="comment"></span>
<a name="l00203"></a>00203 <span class="comment">[2] David Arthur, Sergei Vassilvitskii: k-means++: the advantages of careful</span>
<a name="l00204"></a>00204 <span class="comment"> seeding, Proceedings of the 18th Annual ACM-SIAM Symposium on Discrete</span>
<a name="l00205"></a>00205 <span class="comment"> Algorithms (SODA&#39;07), pp. 1027-1035,</span>
<a name="l00206"></a>00206 <span class="comment"> http://www.stanford.edu/~darthur/kMeansPlusPlus.pdf</span>
<a name="l00207"></a>00207 <span class="comment"></span>
<a name="l00208"></a>00208 <span class="comment">[3] E. R. Hruschka, L. N. C. Silva, R. J. G. B. Campello: Clustering</span>
<a name="l00209"></a>00209 <span class="comment"> Gene-Expression Data: A Hybrid Approach that Iterates Between k-Means and</span>
<a name="l00210"></a>00210 <span class="comment"> Evolutionary Search. In: Studies in Computational Intelligence - Hybrid</span>
<a name="l00211"></a>00211 <span class="comment"> Evolutionary Algorithms. pp. 313-335. Springer. 2007.</span>
<a name="l00212"></a>00212 <span class="comment"></span>
<a name="l00213"></a>00213 <span class="comment">[4] Lloyd, Stuart: Least squares quantization in PCM. Technical Note, Bell</span>
<a name="l00214"></a>00214 <span class="comment"> Laboratories. Published much later in: IEEE Transactions on Information</span>
<a name="l00215"></a>00215 <span class="comment"> Theory 28(2), pp. 128-137. 1982.</span>
<a name="l00216"></a>00216 <span class="comment"></span>
<a name="l00217"></a>00217 <span class="comment">[5] Leisch, Friedrich: A Toolbox for K-Centroids Cluster Analysis. In: Computational</span>
<a name="l00218"></a>00218 <span class="comment"> Statistics and Data Analysis, 51(2). pp. 526-544. 2006.</span>
<a name="l00219"></a>00219 <span class="comment"></span>
<a name="l00220"></a>00220 <span class="comment">@sa File kmeans.sql_in documenting the SQL functions.</span>
<a name="l00221"></a>00221 <span class="comment"></span>
<a name="l00222"></a>00222 <span class="comment">@internal</span>
<a name="l00223"></a>00223 <span class="comment">@sa namespace kmeans (documenting the implementation in Python)</span>
<a name="l00224"></a>00224 <span class="comment">@endinternal</span>
<a name="l00225"></a>00225 <span class="comment">*/</span>
<a name="l00226"></a>00226
<a name="l00227"></a>00227 /*
<a name="l00228"></a>00228 * @brief k-Means return type
<a name="l00229"></a>00229 *
<a name="l00230"></a>00230 * A composite value:
<a name="l00231"></a>00231 * - &lt;tt&gt;centroids&lt;/tt&gt; - Matrix containing the new \f$ l \leq k \f$
<a name="l00232"></a>00232 * repositioned centroids as columns. If this matrix has \f$ l &lt; k \f$
<a name="l00233"></a>00233 * columns, one or more old centroids no longer were closest to any point.
<a name="l00234"></a>00234 * - &lt;tt&gt;old_centroid_its&lt;/tt&gt; - The order of the centroids in
<a name="l00235"></a>00235 * &lt;tt&gt;centroid&lt;/tt&gt; is not guaranteed to be consitent across iterations.
<a name="l00236"></a>00236 * In particular, if a centroid is no longer closest to any point it can be
<a name="l00237"></a>00237 * dropped and a new centroid is added afterwards. We therefore need to map
<a name="l00238"></a>00238 * positions in &lt;tt&gt;centroids&lt;/tt&gt; to the respective positions in the
<a name="l00239"></a>00239 * previous iteration.
<a name="l00240"></a>00240 * - &lt;tt&gt;objective_fn&lt;/tt&gt; - Value of the objective function, i.e.,
<a name="l00241"></a>00241 * \f$ \sum_{x \in P} \dist(x, C)^2 \f$ where
<a name="l00242"></a>00242 * \f$ P \f$ is the set of points, \f$ C \f$ is the set of centroids, and
<a name="l00243"></a>00243 * \f$ \dist(x, C) := \min_{c \in C} \operatorname{dist}(x, c) \f$.
<a name="l00244"></a>00244 * - &lt;tt&gt;frac_reassigned&lt;/tt&gt; - Fraction of points that was assigned a
<a name="l00245"></a>00245 * different centroid in the current iteration.
<a name="l00246"></a>00246 * - &lt;tt&gt;num_iterations&lt;/tt&gt; - Number of iterations performed (so far).
<a name="l00247"></a>00247 */
<a name="l00248"></a>00248 CREATE TYPE MADLIB_SCHEMA.kmeans_result AS (
<a name="l00249"></a>00249 centroids DOUBLE PRECISION[][],
<a name="l00250"></a>00250 objective_fn DOUBLE PRECISION,
<a name="l00251"></a>00251 frac_reassigned DOUBLE PRECISION,
<a name="l00252"></a>00252 num_iterations INTEGER
<a name="l00253"></a>00253 );
<a name="l00254"></a>00254
<a name="l00255"></a>00255 /*
<a name="l00256"></a>00256 * @brief k-Means inter-iteration state type
<a name="l00257"></a>00257 *
<a name="l00258"></a>00258 * A composite value like \ref{kmeans_result}. Additional fields:
<a name="l00259"></a>00259 * - &lt;tt&gt;old_centroid_its&lt;/tt&gt; - The order of the centroids in
<a name="l00260"></a>00260 * &lt;tt&gt;centroid&lt;/tt&gt; is not guaranteed to be consitent across iterations.
<a name="l00261"></a>00261 * In particular, if a centroid is no longer closest to any point it can be
<a name="l00262"></a>00262 * dropped and a new centroid is added afterwards. We therefore need to map
<a name="l00263"></a>00263 * positions in &lt;tt&gt;centroids&lt;/tt&gt; to the respective positions in the
<a name="l00264"></a>00264 * previous iteration.
<a name="l00265"></a>00265 * - &lt;tt&gt;num_iterations&lt;/tt&gt; - Number of iterations performed (so far).
<a name="l00266"></a>00266 */
<a name="l00267"></a>00267 CREATE TYPE MADLIB_SCHEMA.kmeans_state AS (
<a name="l00268"></a>00268 centroids DOUBLE PRECISION[][],
<a name="l00269"></a>00269 old_centroid_ids INTEGER[],
<a name="l00270"></a>00270 objective_fn DOUBLE PRECISION,
<a name="l00271"></a>00271 frac_reassigned DOUBLE PRECISION
<a name="l00272"></a>00272 );
<a name="l00273"></a>00273 <span class="comment"></span>
<a name="l00274"></a>00274 <span class="comment">/**</span>
<a name="l00275"></a>00275 <span class="comment"> * @internal</span>
<a name="l00276"></a>00276 <span class="comment"> * @brief Execute a SQL command where $1, ..., $4 are substituted with the</span>
<a name="l00277"></a>00277 <span class="comment"> * given arguments.</span>
<a name="l00278"></a>00278 <span class="comment"> */</span>
<a name="l00279"></a>00279 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
<a name="l00280"></a>00280 sql VARCHAR, DOUBLE PRECISION[][], REGPROC, INTEGER, DOUBLE PRECISION
<a name="l00281"></a>00281 ) RETURNS VOID
<a name="l00282"></a>00282 VOLATILE
<a name="l00283"></a>00283 CALLED ON NULL INPUT
<a name="l00284"></a>00284 LANGUAGE c
<a name="l00285"></a>00285 AS &#39;MODULE_PATHNAME<span class="stringliteral">&#39;, &#39;</span>exec_sql_using<span class="stringliteral">&#39;;</span>
<a name="l00286"></a>00286 <span class="stringliteral"></span>
<a name="l00287"></a>00287 <span class="stringliteral">CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans(</span>
<a name="l00288"></a>00288 <span class="stringliteral"> rel_args VARCHAR,</span>
<a name="l00289"></a>00289 <span class="stringliteral"> rel_state VARCHAR,</span>
<a name="l00290"></a>00290 <span class="stringliteral"> rel_source VARCHAR,</span>
<a name="l00291"></a>00291 <span class="stringliteral"> expr_point VARCHAR,</span>
<a name="l00292"></a>00292 <span class="stringliteral"> agg_centroid VARCHAR)</span>
<a name="l00293"></a>00293 <span class="stringliteral">RETURNS INTEGER</span>
<a name="l00294"></a>00294 <span class="stringliteral">VOLATILE</span>
<a name="l00295"></a>00295 <span class="stringliteral">LANGUAGE plpythonu</span>
<a name="l00296"></a>00296 <span class="stringliteral">AS $$PythonFunction(kmeans, kmeans, compute_kmeans)$$;</span>
<a name="l00297"></a>00297 <span class="stringliteral"></span><span class="comment"></span>
<a name="l00298"></a>00298 <span class="comment">/**</span>
<a name="l00299"></a>00299 <span class="comment"> * @brief Perform Lloyd&#39;s k-means local-search heuristic</span>
<a name="l00300"></a>00300 <span class="comment"> *</span>
<a name="l00301"></a>00301 <span class="comment"> * @param rel_source Name of the relation containing input points</span>
<a name="l00302"></a>00302 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span>
<a name="l00303"></a>00303 <span class="comment"> * @param initial_centroids Matrix containing the initial centroids as columns</span>
<a name="l00304"></a>00304 <span class="comment"> * @param fn_dist Name of a function with signature</span>
<a name="l00305"></a>00305 <span class="comment"> * &lt;tt&gt;DOUBLE PRECISION[] x DOUBLE PRECISION[] -&gt; DOUBLE PRECISION&lt;/tt&gt; that</span>
<a name="l00306"></a>00306 <span class="comment"> * returns the distance between two points. The default is the</span>
<a name="l00307"></a>00307 <span class="comment"> * \ref squared_dist_norm2(float8[],float8[]) &quot;squared Euclidean distance&quot;.</span>
<a name="l00308"></a>00308 <span class="comment"> * @param agg_centroid Name of an aggregate function with signature</span>
<a name="l00309"></a>00309 <span class="comment"> * &lt;tt&gt;DOUBLE PRECISION[] -&gt; DOUBLE PRECISION[]&lt;/tt&gt; that, for each group</span>
<a name="l00310"></a>00310 <span class="comment"> * of points, returns a centroid. In order for Lloyd&#39;s local-search</span>
<a name="l00311"></a>00311 <span class="comment"> * heuristic to provably converge and to return a local minimum, this</span>
<a name="l00312"></a>00312 <span class="comment"> * centroid should minimize the sum of distances between each point in the</span>
<a name="l00313"></a>00313 <span class="comment"> * group and the centroid. The default is the</span>
<a name="l00314"></a>00314 <span class="comment"> * \ref avg(float8[]) &quot;average (mean/barycenter in Euclidean space)&quot;,</span>
<a name="l00315"></a>00315 <span class="comment"> * which satisfies this property if &lt;tt&gt;fn_dist = &#39;squared_dist_norm2&#39;&lt;/tt&gt;.</span>
<a name="l00316"></a>00316 <span class="comment"> * @param max_num_iterations Maximum number of iterations</span>
<a name="l00317"></a>00317 <span class="comment"> * @param min_frac_reassigned Fraction of reassigned points below which</span>
<a name="l00318"></a>00318 <span class="comment"> * convergence is assumed and the algorithm terminates</span>
<a name="l00319"></a>00319 <span class="comment"> * @returns A composite value:</span>
<a name="l00320"></a>00320 <span class="comment"> * - &lt;tt&gt;centroids&lt;/tt&gt; - Matrix with \f$ k \f$ centroids as columns.</span>
<a name="l00321"></a>00321 <span class="comment"> * - &lt;tt&gt;frac_reassigned&lt;/tt&gt; - Fraction of points that were assigned a</span>
<a name="l00322"></a>00322 <span class="comment"> * different centroid in the last iteration.</span>
<a name="l00323"></a>00323 <span class="comment"> * - &lt;tt&gt;num_iterations&lt;/tt&gt; - The number of iterations before the</span>
<a name="l00324"></a>00324 <span class="comment"> * algorithm terminated</span>
<a name="l00325"></a>00325 <span class="comment"> */</span>
<a name="l00326"></a>00326 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l00327"></a>00327 rel_source VARCHAR,
<a name="l00328"></a>00328 expr_point VARCHAR,
<a name="l00329"></a>00329 initial_centroids DOUBLE PRECISION[][],
<a name="l00330"></a>00330 fn_dist VARCHAR /*+ DEFAULT &#39;<a class="code" href="linalg_8sql__in.html#a00a08e69f27524f2096032214e15b668" title="Squared 2-norm of the difference between two vectors.">squared_dist_norm2</a><span class="stringliteral">&#39; */,</span>
<a name="l00331"></a>00331 <span class="stringliteral"> agg_centroid VARCHAR /*+ DEFAULT &#39;</span><a class="code" href="linalg_8sql__in.html#a1aa37f73fb1cd8d7d106aa518dd8c0b4" title="Compute the average of vectors.">avg</a><span class="stringliteral">&#39; */,</span>
<a name="l00332"></a>00332 <span class="stringliteral"> max_num_iterations INTEGER /*+ DEFAULT 20 */,</span>
<a name="l00333"></a>00333 <span class="stringliteral"> min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */</span>
<a name="l00334"></a>00334 <span class="stringliteral">) RETURNS MADLIB_SCHEMA.kmeans_result AS $$</span>
<a name="l00335"></a>00335 <span class="stringliteral">DECLARE</span>
<a name="l00336"></a>00336 <span class="stringliteral"> theIteration INTEGER;</span>
<a name="l00337"></a>00337 <span class="stringliteral"> theResult MADLIB_SCHEMA.kmeans_result;</span>
<a name="l00338"></a>00338 <span class="stringliteral"> oldClientMinMessages VARCHAR;</span>
<a name="l00339"></a>00339 <span class="stringliteral"> class_rel_source REGCLASS;</span>
<a name="l00340"></a>00340 <span class="stringliteral"> proc_fn_dist REGPROCEDURE;</span>
<a name="l00341"></a>00341 <span class="stringliteral"> proc_agg_centroid REGPROCEDURE;</span>
<a name="l00342"></a>00342 <span class="stringliteral"> rel_filtered VARCHAR;</span>
<a name="l00343"></a>00343 <span class="stringliteral"> num_points INTEGER;</span>
<a name="l00344"></a>00344 <span class="stringliteral"> k INTEGER;</span>
<a name="l00345"></a>00345 <span class="stringliteral"> centroids FLOAT8[];</span>
<a name="l00346"></a>00346 <span class="stringliteral">BEGIN</span>
<a name="l00347"></a>00347 <span class="stringliteral"> IF (array_upper(initial_centroids,1) IS NULL) THEN</span>
<a name="l00348"></a><a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d">00348</a> <span class="stringliteral"> RAISE EXCEPTION &#39;</span>No valid initial centroids given.<span class="stringliteral">&#39;;</span>
<a name="l00349"></a>00349 <span class="stringliteral"> END IF;</span>
<a name="l00350"></a>00350 <span class="stringliteral"></span>
<a name="l00351"></a>00351 <span class="stringliteral"> centroids := ARRAY(SELECT unnest(initial_centroids));</span>
<a name="l00352"></a>00352 <span class="stringliteral"> IF (SELECT MADLIB_SCHEMA.svec_elsum(centroids)) &gt;= &#39;</span>Infinity<span class="stringliteral">&#39;::float THEN</span>
<a name="l00353"></a>00353 <span class="stringliteral"> RAISE EXCEPTION &#39;</span>At least one initial centroid has non-finite values.<span class="stringliteral">&#39;;</span>
<a name="l00354"></a>00354 <span class="stringliteral"> END IF;</span>
<a name="l00355"></a>00355 <span class="stringliteral"></span>
<a name="l00356"></a>00356 <span class="stringliteral"> rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);</span>
<a name="l00357"></a>00357 <span class="stringliteral"> class_rel_source := rel_filtered;</span>
<a name="l00358"></a>00358 <span class="stringliteral"> proc_fn_dist := fn_dist</span>
<a name="l00359"></a>00359 <span class="stringliteral"> || &#39;</span>(DOUBLE PRECISION[], DOUBLE PRECISION[])<span class="stringliteral">&#39;;</span>
<a name="l00360"></a>00360 <span class="stringliteral"> IF (SELECT prorettype != &#39;</span>DOUBLE PRECISION<span class="stringliteral">&#39;::regtype OR proisagg = TRUE</span>
<a name="l00361"></a>00361 <span class="stringliteral"> FROM pg_proc WHERE oid = proc_fn_dist) THEN</span>
<a name="l00362"></a>00362 <span class="stringliteral"> RAISE EXCEPTION &#39;</span>Distance <span class="keyword">function</span> has wrong signature or is not a simple <span class="keyword">function</span>.<span class="stringliteral">&#39;;</span>
<a name="l00363"></a>00363 <span class="stringliteral"> END IF;</span>
<a name="l00364"></a>00364 <span class="stringliteral"> proc_agg_centroid := agg_centroid || &#39;</span>(DOUBLE PRECISION[])<span class="stringliteral">&#39;;</span>
<a name="l00365"></a>00365 <span class="stringliteral"> IF (SELECT prorettype != &#39;</span>DOUBLE PRECISION[]<span class="stringliteral">&#39;::regtype OR proisagg = FALSE</span>
<a name="l00366"></a>00366 <span class="stringliteral"> FROM pg_proc WHERE oid = proc_agg_centroid) THEN</span>
<a name="l00367"></a>00367 <span class="stringliteral"> RAISE EXCEPTION &#39;</span>Mean aggregate has wrong signature or is not an aggregate.<span class="stringliteral">&#39;;</span>
<a name="l00368"></a>00368 <span class="stringliteral"> END IF;</span>
<a name="l00369"></a>00369 <span class="stringliteral"> IF (min_frac_reassigned &lt; 0) OR (min_frac_reassigned &gt; 1) THEN</span>
<a name="l00370"></a>00370 <span class="stringliteral"> RAISE EXCEPTION &#39;</span>Convergence threshold is not a valid value (must be a fraction between 0 and 1).<span class="stringliteral">&#39;;</span>
<a name="l00371"></a>00371 <span class="stringliteral"> END IF;</span>
<a name="l00372"></a>00372 <span class="stringliteral"> IF (max_num_iterations &lt; 0) THEN</span>
<a name="l00373"></a>00373 <span class="stringliteral"> RAISE EXCEPTION &#39;</span>Number of iterations must be a non-negative integer.<span class="stringliteral">&#39;;</span>
<a name="l00374"></a>00374 <span class="stringliteral"> END IF;</span>
<a name="l00375"></a>00375 <span class="stringliteral"></span>
<a name="l00376"></a>00376 <span class="stringliteral"> -- Extra parameter check added so that ERROR output is more user-readable (doesn&#39;</span>t include Python traceback)
<a name="l00377"></a>00377 k := array_upper(initial_centroids,1);
<a name="l00378"></a>00378 IF (k &lt;= 0) THEN
<a name="l00379"></a>00379 RAISE EXCEPTION &#39;Number of clusters k must be a positive integer.&#39;;
<a name="l00380"></a>00380 END IF;
<a name="l00381"></a>00381 IF (k &gt; 32767) THEN
<a name="l00382"></a>00382 RAISE EXCEPTION &#39;Number of clusters k must be &lt;= 32767 (for results to be returned in a reasonable amount of time).&#39;;
<a name="l00383"></a>00383 END IF;
<a name="l00384"></a>00384 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ;
<a name="l00385"></a>00385 IF (num_points &lt; k) THEN
<a name="l00386"></a>00386 RAISE EXCEPTION &#39;Number of centroids is greater than number of points.&#39;;
<a name="l00387"></a>00387 END IF;
<a name="l00388"></a>00388
<a name="l00389"></a>00389 -- We first setup the argument table. Rationale: We want to avoid all data
<a name="l00390"></a>00390 -- conversion between native types and Python code. Instead, we use Python
<a name="l00391"></a>00391 -- as a pure driver layer.
<a name="l00392"></a>00392 PERFORM MADLIB_SCHEMA.<a class="code" href="utilities_8sql__in.html#a56501b6f9fabe65d7a6a6beb70a0e000" title="Create the temporary schema if it does not exist yet.">create_schema_pg_temp</a>();
<a name="l00393"></a>00393 oldClientMinMessages :=
<a name="l00394"></a>00394 (SELECT setting FROM pg_settings WHERE name = &#39;client_min_messages&#39;);
<a name="l00395"></a>00395 EXECUTE &#39;SET client_min_messages TO warning&#39;;
<a name="l00396"></a>00396
<a name="l00397"></a>00397 -- Unfortunately, the EXECUTE USING syntax is only available starting
<a name="l00398"></a>00398 -- PostgreSQL 8.4:
<a name="l00399"></a>00399 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span>
<a name="l00400"></a>00400 -- We therefore have to emulate.
<a name="l00401"></a>00401 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
<a name="l00402"></a>00402 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_args;
<a name="l00403"></a>00403 CREATE TABLE pg_temp._madlib_kmeans_args AS
<a name="l00404"></a>00404 SELECT
<a name="l00405"></a>00405 $1 AS initial_centroids, array_upper($1, 1) AS k,
<a name="l00406"></a>00406 $2 AS fn_dist, $3 AS max_num_iterations,
<a name="l00407"></a>00407 $4 AS min_frac_reassigned;
<a name="l00408"></a>00408 $sql$,
<a name="l00409"></a>00409 initial_centroids, proc_fn_dist, max_num_iterations,
<a name="l00410"></a>00410 min_frac_reassigned);
<a name="l00411"></a>00411 EXECUTE &#39;SET client_min_messages TO &#39; || oldClientMinMessages;
<a name="l00412"></a>00412
<a name="l00413"></a>00413 -- Perform acutal computation.
<a name="l00414"></a>00414 -- Unfortunately, Greenplum and PostgreSQL &lt;= 8.2 do not have conversion
<a name="l00415"></a>00415 -- operators from regclass to varchar/text.
<a name="l00416"></a>00416 theIteration := MADLIB_SCHEMA.internal_compute_kmeans(&#39;_madlib_kmeans_args&#39;,
<a name="l00417"></a>00417 &#39;_madlib_kmeans_state&#39;,
<a name="l00418"></a>00418 textin(regclassout(class_rel_source)), expr_point,
<a name="l00419"></a>00419 textin(regprocout(proc_agg_centroid)));
<a name="l00420"></a>00420
<a name="l00421"></a>00421 -- Retrieve result from state table and return it
<a name="l00422"></a>00422 EXECUTE
<a name="l00423"></a>00423 $sql$
<a name="l00424"></a>00424 SELECT (_state).centroids, (_state).objective_fn,
<a name="l00425"></a>00425 (_state).frac_reassigned, NULL
<a name="l00426"></a>00426 FROM _madlib_kmeans_state
<a name="l00427"></a>00427 WHERE _iteration = $sql$ || theIteration || $sql$
<a name="l00428"></a>00428 $sql$
<a name="l00429"></a>00429 INTO theResult;
<a name="l00430"></a>00430 -- The number of iterations are not updated in the C++ code. We do it here.
<a name="l00431"></a>00431 IF NOT (theResult IS NULL) THEN
<a name="l00432"></a>00432 theResult.num_iterations = theIteration;
<a name="l00433"></a>00433 END IF;
<a name="l00434"></a>00434 RETURN theResult;
<a name="l00435"></a>00435 END;
<a name="l00436"></a>00436 $$ LANGUAGE plpgsql VOLATILE;
<a name="l00437"></a>00437
<a name="l00438"></a>00438 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d" title="Perform Lloyd&#39;s k-means local-search heuristic.">kmeans</a>(
<a name="l00439"></a>00439 rel_source VARCHAR,
<a name="l00440"></a>00440 expr_point VARCHAR,
<a name="l00441"></a>00441 initial_centroids DOUBLE PRECISION[][],
<a name="l00442"></a>00442 fn_dist VARCHAR,
<a name="l00443"></a>00443 agg_centroid VARCHAR,
<a name="l00444"></a>00444 max_num_iterations INTEGER
<a name="l00445"></a>00445 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00446"></a>00446 VOLATILE
<a name="l00447"></a>00447 STRICT
<a name="l00448"></a>00448 LANGUAGE sql AS $$
<a name="l00449"></a>00449 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ae8bb21bf12220aa9de82792376afab7d" title="Perform Lloyd&#39;s k-means local-search heuristic.">kmeans</a>($1, $2, $3, $4, $5, $6, 0.001)
<a name="l00450"></a>00450 $$;
<a name="l00451"></a>00451
<a name="l00452"></a>00452 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l00453"></a>00453 rel_source VARCHAR,
<a name="l00454"></a>00454 expr_point VARCHAR,
<a name="l00455"></a>00455 initial_centroids DOUBLE PRECISION[][],
<a name="l00456"></a>00456 fn_dist VARCHAR,
<a name="l00457"></a>00457 agg_centroid VARCHAR
<a name="l00458"></a>00458 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00459"></a>00459 VOLATILE
<a name="l00460"></a>00460 STRICT
<a name="l00461"></a>00461 LANGUAGE sql AS $$
<a name="l00462"></a>00462 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, 20, 0.001)
<a name="l00463"></a>00463 $$;
<a name="l00464"></a>00464
<a name="l00465"></a>00465 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l00466"></a>00466 rel_source VARCHAR,
<a name="l00467"></a>00467 expr_point VARCHAR,
<a name="l00468"></a>00468 initial_centroids DOUBLE PRECISION[][],
<a name="l00469"></a>00469 fn_dist VARCHAR
<a name="l00470"></a>00470 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00471"></a>00471 VOLATILE
<a name="l00472"></a>00472 STRICT
<a name="l00473"></a>00473 LANGUAGE sql AS $$
<a name="l00474"></a>00474 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, &#39;MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a1aa37f73fb1cd8d7d106aa518dd8c0b4" title="Compute the average of vectors.">avg</a>&#39;, 20,
<a name="l00475"></a>00475 0.001)
<a name="l00476"></a>00476 $$;
<a name="l00477"></a>00477
<a name="l00478"></a>00478 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l00479"></a>00479 rel_source VARCHAR,
<a name="l00480"></a>00480 expr_point VARCHAR,
<a name="l00481"></a>00481 initial_centroids DOUBLE PRECISION[][]
<a name="l00482"></a>00482 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00483"></a>00483 VOLATILE
<a name="l00484"></a>00484 STRICT
<a name="l00485"></a>00485 LANGUAGE sql AS $$
<a name="l00486"></a>00486 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3,
<a name="l00487"></a>00487 &#39;MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a00a08e69f27524f2096032214e15b668" title="Squared 2-norm of the difference between two vectors.">squared_dist_norm2</a>&#39;, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001)
<a name="l00488"></a>00488 $$;
<a name="l00489"></a>00489
<a name="l00490"></a>00490 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args(
<a name="l00491"></a>00491 sql VARCHAR, INTEGER, REGPROC, DOUBLE PRECISION[][]
<a name="l00492"></a>00492 ) RETURNS VOID
<a name="l00493"></a>00493 VOLATILE
<a name="l00494"></a>00494 CALLED ON NULL INPUT
<a name="l00495"></a>00495 LANGUAGE c
<a name="l00496"></a>00496 AS &#39;MODULE_PATHNAME&#39;, &#39;exec_sql_using&#39;;
<a name="l00497"></a>00497
<a name="l00498"></a>00498 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
<a name="l00499"></a>00499 rel_args VARCHAR,
<a name="l00500"></a>00500 rel_state VARCHAR,
<a name="l00501"></a>00501 rel_source VARCHAR,
<a name="l00502"></a>00502 expr_point VARCHAR)
<a name="l00503"></a>00503 RETURNS INTEGER
<a name="l00504"></a>00504 AS $$PythonFunction(kmeans, kmeans, compute_kmeanspp_seeding)$$
<a name="l00505"></a>00505 LANGUAGE plpythonu VOLATILE;
<a name="l00506"></a>00506 <span class="comment"></span>
<a name="l00507"></a>00507 <span class="comment">/**</span>
<a name="l00508"></a>00508 <span class="comment"> * @brief k-Means++ Seeding</span>
<a name="l00509"></a>00509 <span class="comment"> *</span>
<a name="l00510"></a>00510 <span class="comment"> * @param rel_source Name of the relation containing input points</span>
<a name="l00511"></a>00511 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span>
<a name="l00512"></a>00512 <span class="comment"> * @param k Number of centroids</span>
<a name="l00513"></a>00513 <span class="comment"> * @param fn_dist Name of a function with signature</span>
<a name="l00514"></a>00514 <span class="comment"> * &lt;tt&gt;DOUBLE PRECISION[] x DOUBLE PRECISION[] -&gt; DOUBLE PRECISION&lt;/tt&gt; that</span>
<a name="l00515"></a>00515 <span class="comment"> * returns the distance between two points</span>
<a name="l00516"></a>00516 <span class="comment"> * @param initial_centroids A matrix containing up to \f$ k \f$ columns as</span>
<a name="l00517"></a>00517 <span class="comment"> * columns. kmeanspp_seeding() proceeds exactly as if these centroids had</span>
<a name="l00518"></a>00518 <span class="comment"> * already been generated in previous iterations. This parameter may be</span>
<a name="l00519"></a>00519 <span class="comment"> * NULL in which all \f$ k \f$ centroids will be generated.</span>
<a name="l00520"></a>00520 <span class="comment"> * @returns A matrix containing \f$ k \f$ centroids as columns</span>
<a name="l00521"></a>00521 <span class="comment"> */</span>
<a name="l00522"></a>00522 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>(
<a name="l00523"></a>00523 rel_source VARCHAR,
<a name="l00524"></a>00524 expr_point VARCHAR,
<a name="l00525"></a>00525 k INTEGER,
<a name="l00526"></a>00526 fn_dist VARCHAR <span class="comment">/*+ DEFAULT &#39;squared_dist_norm2&#39; */</span>,
<a name="l00527"></a>00527 initial_centroids DOUBLE PRECISION[][] <span class="comment">/*+ DEFAULT NULL */</span>
<a name="l00528"></a>00528 ) RETURNS DOUBLE PRECISION[][] AS $$
<a name="l00529"></a>00529 DECLARE
<a name="l00530"></a>00530 theIteration INTEGER;
<a name="l00531"></a>00531 theResult DOUBLE PRECISION[][];
<a name="l00532"></a>00532 oldClientMinMessages VARCHAR;
<a name="l00533"></a>00533 class_rel_source REGCLASS;
<a name="l00534"></a>00534 proc_fn_dist REGPROCEDURE;
<a name="l00535"></a>00535 num_points INTEGER;
<a name="l00536"></a>00536 num_centroids INTEGER;
<a name="l00537"></a>00537 rel_filtered VARCHAR;
<a name="l00538"></a>00538 BEGIN
<a name="l00539"></a>00539 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
<a name="l00540"></a>00540 class_rel_source := rel_filtered;
<a name="l00541"></a>00541
<a name="l00542"></a>00542 IF (initial_centroids IS NOT NULL) THEN
<a name="l00543"></a>00543 num_centroids := array_upper(initial_centroids,1);
<a name="l00544"></a><a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e">00544</a> ELSE
<a name="l00545"></a>00545 num_centroids := k;
<a name="l00546"></a>00546 END IF;
<a name="l00547"></a>00547
<a name="l00548"></a>00548 proc_fn_dist := fn_dist
<a name="l00549"></a>00549 || &#39;(DOUBLE PRECISION[], DOUBLE PRECISION[])&#39;;
<a name="l00550"></a>00550 IF (SELECT prorettype != &#39;DOUBLE PRECISION&#39;::regtype OR proisagg = TRUE
<a name="l00551"></a>00551 FROM pg_proc WHERE oid = proc_fn_dist) THEN
<a name="l00552"></a>00552 RAISE EXCEPTION &#39;Distance function has wrong signature or is not a simple function.&#39;;
<a name="l00553"></a>00553 END IF;
<a name="l00554"></a>00554 IF (k &lt;= 0) THEN
<a name="l00555"></a>00555 RAISE EXCEPTION &#39;Number of clusters k must be a positive integer.&#39;;
<a name="l00556"></a>00556 END IF;
<a name="l00557"></a>00557 IF (k &gt; 32767) THEN
<a name="l00558"></a>00558 RAISE EXCEPTION &#39;Number of clusters k must be &lt;= 32767 (for results to be returned in a reasonable amount of time).&#39;;
<a name="l00559"></a>00559 END IF;
<a name="l00560"></a>00560 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ;
<a name="l00561"></a>00561 IF (num_points &lt; k OR num_points &lt; num_centroids) THEN
<a name="l00562"></a>00562 RAISE EXCEPTION &#39;Number of centroids is greater than number of points.&#39;;
<a name="l00563"></a>00563 END IF;
<a name="l00564"></a>00564 IF (k &lt; num_centroids) THEN
<a name="l00565"></a>00565 RAISE WARNING &#39;Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.&#39;;
<a name="l00566"></a>00566 END IF;
<a name="l00567"></a>00567
<a name="l00568"></a>00568 -- We first setup the argument table. Rationale: We want to avoid all data
<a name="l00569"></a>00569 -- conversion between native types and Python code. Instead, we use Python
<a name="l00570"></a>00570 -- as a pure driver layer.
<a name="l00571"></a>00571 oldClientMinMessages :=
<a name="l00572"></a>00572 (SELECT setting FROM pg_settings WHERE name = &#39;client_min_messages&#39;);
<a name="l00573"></a>00573 EXECUTE &#39;SET client_min_messages TO warning&#39;;
<a name="l00574"></a>00574 PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
<a name="l00575"></a>00575 -- Unfortunately, the EXECUTE USING syntax is only available starting
<a name="l00576"></a>00576 -- PostgreSQL 8.4:
<a name="l00577"></a>00577 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span>
<a name="l00578"></a>00578 -- We therefore have to emulate.
<a name="l00579"></a>00579 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args($sql$
<a name="l00580"></a>00580 DROP TABLE IF EXISTS pg_temp._madlib_kmeanspp_args;
<a name="l00581"></a>00581 CREATE TEMPORARY TABLE _madlib_kmeanspp_args AS
<a name="l00582"></a>00582 SELECT $1 AS k, $2 AS fn_dist, $3 AS initial_centroids;
<a name="l00583"></a>00583 $sql$,
<a name="l00584"></a>00584 k, proc_fn_dist, initial_centroids);
<a name="l00585"></a>00585 EXECUTE &#39;SET client_min_messages TO &#39; || oldClientMinMessages;
<a name="l00586"></a>00586
<a name="l00587"></a>00587 -- Perform acutal computation.
<a name="l00588"></a>00588 -- Unfortunately, Greenplum and PostgreSQL &lt;= 8.2 do not have conversion
<a name="l00589"></a>00589 -- operators from regclass to varchar/text.
<a name="l00590"></a>00590 theIteration := (
<a name="l00591"></a>00591 SELECT MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
<a name="l00592"></a>00592 &#39;_madlib_kmeanspp_args&#39;, &#39;_madlib_kmeanspp_state&#39;,
<a name="l00593"></a>00593 textin(regclassout(class_rel_source)), expr_point)
<a name="l00594"></a>00594 );
<a name="l00595"></a>00595
<a name="l00596"></a>00596 -- Retrieve result from state table and return it
<a name="l00597"></a>00597 EXECUTE
<a name="l00598"></a>00598 $sql$
<a name="l00599"></a>00599 SELECT _state FROM _madlib_kmeanspp_state
<a name="l00600"></a>00600 WHERE _iteration = $sql$ || theIteration || $sql$
<a name="l00601"></a>00601 $sql$
<a name="l00602"></a>00602 INTO theResult;
<a name="l00603"></a>00603 RETURN theResult;
<a name="l00604"></a>00604 END;
<a name="l00605"></a>00605 $$ LANGUAGE plpgsql VOLATILE;
<a name="l00606"></a>00606
<a name="l00607"></a>00607 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>(
<a name="l00608"></a>00608 rel_source VARCHAR,
<a name="l00609"></a>00609 expr_point VARCHAR,
<a name="l00610"></a>00610 k INTEGER,
<a name="l00611"></a>00611 fn_dist VARCHAR
<a name="l00612"></a>00612 ) RETURNS DOUBLE PRECISION[][]
<a name="l00613"></a>00613 LANGUAGE sql AS $$
<a name="l00614"></a>00614 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#af0d5172211c83d4de4d70a84555aa68e" title="k-Means++ Seeding">kmeanspp_seeding</a>($1, $2, $3, $4, NULL)
<a name="l00615"></a>00615 $$;
<a name="l00616"></a>00616
<a name="l00617"></a>00617 CREATE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
<a name="l00618"></a>00618 rel_source VARCHAR,
<a name="l00619"></a>00619 expr_point VARCHAR,
<a name="l00620"></a>00620 k INTEGER
<a name="l00621"></a>00621 ) RETURNS DOUBLE PRECISION[][]
<a name="l00622"></a>00622 LANGUAGE sql AS $$
<a name="l00623"></a>00623 SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3,
<a name="l00624"></a>00624 &#39;MADLIB_SCHEMA.squared_dist_norm2&#39;, NULL)
<a name="l00625"></a>00625 $$;
<a name="l00626"></a>00626 <span class="comment"></span>
<a name="l00627"></a>00627 <span class="comment">/**</span>
<a name="l00628"></a>00628 <span class="comment"> * @brief Run k-Means++.</span>
<a name="l00629"></a>00629 <span class="comment"> *</span>
<a name="l00630"></a>00630 <span class="comment"> * This is a shortcut for running k-means++. It is equivalent to</span>
<a name="l00631"></a>00631 <span class="comment"> * &lt;pre&gt;SELECT \ref kmeans(</span>
<a name="l00632"></a>00632 <span class="comment"> rel_source,</span>
<a name="l00633"></a>00633 <span class="comment"> expr_point,</span>
<a name="l00634"></a>00634 <span class="comment"> \ref kmeanspp_seeding(</span>
<a name="l00635"></a>00635 <span class="comment"> rel_source,</span>
<a name="l00636"></a>00636 <span class="comment"> expr_point,</span>
<a name="l00637"></a>00637 <span class="comment"> k,</span>
<a name="l00638"></a>00638 <span class="comment"> fn_dist</span>
<a name="l00639"></a>00639 <span class="comment"> ),</span>
<a name="l00640"></a>00640 <span class="comment"> fn_dist,</span>
<a name="l00641"></a>00641 <span class="comment"> agg_centroid,</span>
<a name="l00642"></a>00642 <span class="comment"> max_num_iterations,</span>
<a name="l00643"></a>00643 <span class="comment"> min_frac_reassigned</span>
<a name="l00644"></a>00644 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00645"></a>00645 <span class="comment"> */</span>
<a name="l00646"></a>00646 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>(
<a name="l00647"></a>00647 rel_source VARCHAR,
<a name="l00648"></a>00648 expr_point VARCHAR,
<a name="l00649"></a>00649 k INTEGER,
<a name="l00650"></a>00650 fn_dist VARCHAR <span class="comment">/*+ DEFAULT &#39;squared_dist_norm2&#39; */</span>,
<a name="l00651"></a>00651 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT &#39;avg&#39; */</span>,
<a name="l00652"></a>00652 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>,
<a name="l00653"></a>00653 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span>
<a name="l00654"></a>00654 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00655"></a>00655 VOLATILE
<a name="l00656"></a>00656 STRICT
<a name="l00657"></a>00657 LANGUAGE plpgsql
<a name="l00658"></a>00658 AS $$
<a name="l00659"></a>00659 DECLARE
<a name="l00660"></a>00660 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00661"></a>00661 BEGIN
<a name="l00662"></a>00662 ret = MADLIB_SCHEMA.kmeans(
<a name="l00663"></a>00663 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
<a name="l00664"></a>00664 $4, $5, $6, $7);
<a name="l00665"></a>00665 RETURN ret;
<a name="l00666"></a>00666 END
<a name="l00667"></a>00667 $$;
<a name="l00668"></a><a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4">00668</a>
<a name="l00669"></a>00669 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>(
<a name="l00670"></a>00670 rel_source VARCHAR,
<a name="l00671"></a>00671 expr_point VARCHAR,
<a name="l00672"></a>00672 k INTEGER,
<a name="l00673"></a>00673 fn_dist VARCHAR,
<a name="l00674"></a>00674 agg_centroid VARCHAR,
<a name="l00675"></a>00675 max_num_iterations INTEGER
<a name="l00676"></a>00676 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00677"></a>00677 VOLATILE
<a name="l00678"></a>00678 STRICT
<a name="l00679"></a>00679 LANGUAGE plpgsql
<a name="l00680"></a>00680 AS $$
<a name="l00681"></a>00681 DECLARE
<a name="l00682"></a>00682 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00683"></a>00683 BEGIN
<a name="l00684"></a>00684 ret = MADLIB_SCHEMA.kmeans(
<a name="l00685"></a>00685 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
<a name="l00686"></a>00686 $4, $5, $6, 0.001);
<a name="l00687"></a>00687 RETURN ret;
<a name="l00688"></a>00688 END
<a name="l00689"></a>00689 $$;
<a name="l00690"></a>00690
<a name="l00691"></a>00691 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>(
<a name="l00692"></a>00692 rel_source VARCHAR,
<a name="l00693"></a>00693 expr_point VARCHAR,
<a name="l00694"></a>00694 k INTEGER,
<a name="l00695"></a>00695 fn_dist VARCHAR,
<a name="l00696"></a>00696 agg_centroid VARCHAR
<a name="l00697"></a>00697 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00698"></a>00698 VOLATILE
<a name="l00699"></a>00699 STRICT
<a name="l00700"></a>00700 LANGUAGE plpgsql
<a name="l00701"></a>00701 AS $$
<a name="l00702"></a>00702 DECLARE
<a name="l00703"></a>00703 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00704"></a>00704 BEGIN
<a name="l00705"></a>00705 ret = MADLIB_SCHEMA.kmeans(
<a name="l00706"></a>00706 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
<a name="l00707"></a>00707 $4, $5, 20, 0.001);
<a name="l00708"></a>00708 RETURN ret;
<a name="l00709"></a>00709 END
<a name="l00710"></a>00710 $$;
<a name="l00711"></a>00711
<a name="l00712"></a>00712 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>(
<a name="l00713"></a>00713 rel_source VARCHAR,
<a name="l00714"></a>00714 expr_point VARCHAR,
<a name="l00715"></a>00715 k INTEGER,
<a name="l00716"></a>00716 fn_dist VARCHAR
<a name="l00717"></a>00717 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00718"></a>00718 VOLATILE
<a name="l00719"></a>00719 STRICT
<a name="l00720"></a>00720 LANGUAGE plpgsql
<a name="l00721"></a>00721 AS $$
<a name="l00722"></a>00722 DECLARE
<a name="l00723"></a>00723 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00724"></a>00724 BEGIN
<a name="l00725"></a>00725 ret = MADLIB_SCHEMA.kmeans(
<a name="l00726"></a>00726 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
<a name="l00727"></a>00727 $4, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001);
<a name="l00728"></a>00728 RETURN ret;
<a name="l00729"></a>00729 END
<a name="l00730"></a>00730 $$;
<a name="l00731"></a>00731
<a name="l00732"></a>00732 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#ac6c26c8e6b4643acfa79a87bd3ab0fe4" title="Run k-Means++.">kmeanspp</a>(
<a name="l00733"></a>00733 rel_source VARCHAR,
<a name="l00734"></a>00734 expr_point VARCHAR,
<a name="l00735"></a>00735 k INTEGER
<a name="l00736"></a>00736 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00737"></a>00737 VOLATILE
<a name="l00738"></a>00738 STRICT
<a name="l00739"></a>00739 LANGUAGE plpgsql
<a name="l00740"></a>00740 AS $$
<a name="l00741"></a>00741 DECLARE
<a name="l00742"></a>00742 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00743"></a>00743 BEGIN
<a name="l00744"></a>00744 ret = MADLIB_SCHEMA.kmeans(
<a name="l00745"></a>00745 $1, $2,
<a name="l00746"></a>00746 MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3,
<a name="l00747"></a>00747 &#39;MADLIB_SCHEMA.squared_dist_norm2&#39;),
<a name="l00748"></a>00748 &#39;MADLIB_SCHEMA.squared_dist_norm2&#39;, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001);
<a name="l00749"></a>00749 RETURN ret;
<a name="l00750"></a>00750 END
<a name="l00751"></a>00751 $$;
<a name="l00752"></a>00752
<a name="l00753"></a>00753 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args(
<a name="l00754"></a>00754 sql VARCHAR, INTEGER, DOUBLE PRECISION[][]
<a name="l00755"></a>00755 ) RETURNS VOID
<a name="l00756"></a>00756 VOLATILE
<a name="l00757"></a>00757 CALLED ON NULL INPUT
<a name="l00758"></a>00758 LANGUAGE c
<a name="l00759"></a>00759 AS &#39;MODULE_PATHNAME&#39;, &#39;exec_sql_using&#39;;
<a name="l00760"></a>00760
<a name="l00761"></a>00761 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
<a name="l00762"></a>00762 rel_args VARCHAR,
<a name="l00763"></a>00763 rel_state VARCHAR,
<a name="l00764"></a>00764 rel_source VARCHAR,
<a name="l00765"></a>00765 expr_point VARCHAR)
<a name="l00766"></a>00766 RETURNS INTEGER
<a name="l00767"></a>00767 AS $$PythonFunction(kmeans, kmeans, compute_kmeans_random_seeding)$$
<a name="l00768"></a>00768 LANGUAGE plpythonu VOLATILE;
<a name="l00769"></a>00769 <span class="comment"></span>
<a name="l00770"></a>00770 <span class="comment">/**</span>
<a name="l00771"></a>00771 <span class="comment"> * @brief k-Means Random Seeding</span>
<a name="l00772"></a>00772 <span class="comment"> *</span>
<a name="l00773"></a>00773 <span class="comment"> * @param rel_source Name of the relation containing input points</span>
<a name="l00774"></a>00774 <span class="comment"> * @param expr_point Expression evaluating to point coordinates for each tuple</span>
<a name="l00775"></a>00775 <span class="comment"> * @param k Number of centroids</span>
<a name="l00776"></a>00776 <span class="comment"> * @param initial_centroids A matrix containing up to \f$ k \f$ columns as</span>
<a name="l00777"></a>00777 <span class="comment"> * columns. kmeanspp_seeding() proceeds exactly as if these centroids had</span>
<a name="l00778"></a>00778 <span class="comment"> * already been generated in previous iterations. This parameter may be</span>
<a name="l00779"></a>00779 <span class="comment"> * NULL in which all \f$ k \f$ centroids will be generated.</span>
<a name="l00780"></a>00780 <span class="comment"> * @returns A matrix containing \f$ k \f$ centroids as columns</span>
<a name="l00781"></a>00781 <span class="comment"> */</span>
<a name="l00782"></a>00782 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>(
<a name="l00783"></a>00783 rel_source VARCHAR,
<a name="l00784"></a>00784 expr_point VARCHAR,
<a name="l00785"></a>00785 k INTEGER,
<a name="l00786"></a>00786 initial_centroids DOUBLE PRECISION[][] <span class="comment">/*+ DEFAULT NULL */</span>
<a name="l00787"></a>00787 ) RETURNS DOUBLE PRECISION[][] AS $$
<a name="l00788"></a>00788 DECLARE
<a name="l00789"></a>00789 theIteration INTEGER;
<a name="l00790"></a>00790 theResult DOUBLE PRECISION[][];
<a name="l00791"></a>00791 oldClientMinMessages VARCHAR;
<a name="l00792"></a>00792 class_rel_source REGCLASS;
<a name="l00793"></a>00793 proc_fn_dist REGPROCEDURE;
<a name="l00794"></a>00794 num_points INTEGER;
<a name="l00795"></a>00795 num_centroids INTEGER;
<a name="l00796"></a>00796 rel_filtered VARCHAR;
<a name="l00797"></a>00797 BEGIN
<a name="l00798"></a>00798 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
<a name="l00799"></a>00799 class_rel_source := rel_filtered;
<a name="l00800"></a>00800
<a name="l00801"></a>00801 IF (initial_centroids IS NOT NULL) THEN
<a name="l00802"></a>00802 num_centroids := array_upper(initial_centroids,1);
<a name="l00803"></a>00803 ELSE
<a name="l00804"></a><a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3">00804</a> num_centroids := k;
<a name="l00805"></a>00805 END IF;
<a name="l00806"></a>00806
<a name="l00807"></a>00807 IF (k &lt;= 0) THEN
<a name="l00808"></a>00808 RAISE EXCEPTION &#39;Number of clusters k must be a positive integer.&#39;;
<a name="l00809"></a>00809 END IF;
<a name="l00810"></a>00810 IF (k &gt; 32767) THEN
<a name="l00811"></a>00811 RAISE EXCEPTION &#39;Number of clusters k must be &lt;= 32767 (for results to be returned in a reasonable amount of time).&#39;;
<a name="l00812"></a>00812 END IF;
<a name="l00813"></a>00813 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points;
<a name="l00814"></a>00814 IF (num_points &lt; k OR num_points &lt; num_centroids) THEN
<a name="l00815"></a>00815 RAISE EXCEPTION &#39;Number of centroids is greater than number of points.&#39;;
<a name="l00816"></a>00816 END IF;
<a name="l00817"></a>00817 IF (k &lt; num_centroids) THEN
<a name="l00818"></a>00818 RAISE WARNING &#39;Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.&#39;;
<a name="l00819"></a>00819 END IF;
<a name="l00820"></a>00820
<a name="l00821"></a>00821 -- We first setup the argument table. Rationale: We want to avoid all data
<a name="l00822"></a>00822 -- conversion between native types and Python code. Instead, we use Python
<a name="l00823"></a>00823 -- as a pure driver layer.
<a name="l00824"></a>00824 oldClientMinMessages :=
<a name="l00825"></a>00825 (SELECT setting FROM pg_settings WHERE name = &#39;client_min_messages&#39;);
<a name="l00826"></a>00826 EXECUTE &#39;SET client_min_messages TO warning&#39;;
<a name="l00827"></a>00827 PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
<a name="l00828"></a>00828 -- Unfortunately, the EXECUTE USING syntax is only available starting
<a name="l00829"></a>00829 -- PostgreSQL 8.4:
<a name="l00830"></a>00830 -- http:<span class="comment">//www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN</span>
<a name="l00831"></a>00831 -- We therefore have to emulate.
<a name="l00832"></a>00832 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args($sql$
<a name="l00833"></a>00833 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_random_args;
<a name="l00834"></a>00834 CREATE TEMPORARY TABLE _madlib_kmeans_random_args AS
<a name="l00835"></a>00835 SELECT $1 AS k, $2 AS initial_centroids;
<a name="l00836"></a>00836 $sql$,
<a name="l00837"></a>00837 k, initial_centroids);
<a name="l00838"></a>00838 EXECUTE &#39;SET client_min_messages TO &#39; || oldClientMinMessages;
<a name="l00839"></a>00839
<a name="l00840"></a>00840 -- Perform acutal computation.
<a name="l00841"></a>00841 -- Unfortunately, Greenplum and PostgreSQL &lt;= 8.2 do not have conversion
<a name="l00842"></a>00842 -- operators from regclass to varchar/text.
<a name="l00843"></a>00843 theIteration := (
<a name="l00844"></a>00844 SELECT MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
<a name="l00845"></a>00845 &#39;_madlib_kmeans_random_args&#39;, &#39;_madlib_kmeans_random_state&#39;,
<a name="l00846"></a>00846 textin(regclassout(class_rel_source)), expr_point)
<a name="l00847"></a>00847 );
<a name="l00848"></a>00848
<a name="l00849"></a>00849 -- Retrieve result from state table and return it
<a name="l00850"></a>00850 EXECUTE
<a name="l00851"></a>00851 $sql$
<a name="l00852"></a>00852 SELECT _state FROM _madlib_kmeans_random_state
<a name="l00853"></a>00853 WHERE _iteration = $sql$ || theIteration || $sql$
<a name="l00854"></a>00854 $sql$
<a name="l00855"></a>00855 INTO theResult;
<a name="l00856"></a>00856 RETURN theResult;
<a name="l00857"></a>00857 END;
<a name="l00858"></a>00858 $$ LANGUAGE plpgsql VOLATILE;
<a name="l00859"></a>00859
<a name="l00860"></a>00860 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>(
<a name="l00861"></a>00861 rel_source VARCHAR,
<a name="l00862"></a>00862 expr_point VARCHAR,
<a name="l00863"></a>00863 k INTEGER
<a name="l00864"></a>00864 ) RETURNS DOUBLE PRECISION[][]
<a name="l00865"></a>00865 LANGUAGE sql AS $$
<a name="l00866"></a>00866 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a01e02736e6d156240b15f7d6dae092c3" title="k-Means Random Seeding">kmeans_random_seeding</a>($1, $2, $3, NULL)
<a name="l00867"></a>00867 $$;
<a name="l00868"></a>00868 <span class="comment"></span>
<a name="l00869"></a>00869 <span class="comment">/**</span>
<a name="l00870"></a>00870 <span class="comment"> * @brief Run k-Means with random seeding.</span>
<a name="l00871"></a>00871 <span class="comment"> *</span>
<a name="l00872"></a>00872 <span class="comment"> * This is a shortcut for running k-means with random seeding. It is equivalent</span>
<a name="l00873"></a>00873 <span class="comment"> * to</span>
<a name="l00874"></a>00874 <span class="comment"> * &lt;pre&gt;SELECT \ref kmeans(</span>
<a name="l00875"></a>00875 <span class="comment"> rel_source,</span>
<a name="l00876"></a>00876 <span class="comment"> expr_point,</span>
<a name="l00877"></a>00877 <span class="comment"> \ref kmeans_random_seeding(</span>
<a name="l00878"></a>00878 <span class="comment"> rel_source,</span>
<a name="l00879"></a>00879 <span class="comment"> expr_point,</span>
<a name="l00880"></a>00880 <span class="comment"> k</span>
<a name="l00881"></a>00881 <span class="comment"> ),</span>
<a name="l00882"></a>00882 <span class="comment"> fn_dist,</span>
<a name="l00883"></a>00883 <span class="comment"> agg_centroid,</span>
<a name="l00884"></a>00884 <span class="comment"> max_num_iterations,</span>
<a name="l00885"></a>00885 <span class="comment"> min_frac_reassigned</span>
<a name="l00886"></a>00886 <span class="comment">)&lt;/pre&gt;</span>
<a name="l00887"></a>00887 <span class="comment"> */</span>
<a name="l00888"></a>00888 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>(
<a name="l00889"></a>00889 rel_source VARCHAR,
<a name="l00890"></a>00890 expr_point VARCHAR,
<a name="l00891"></a>00891 k INTEGER,
<a name="l00892"></a>00892 fn_dist VARCHAR <span class="comment">/*+ DEFAULT &#39;squared_dist_norm2&#39; */</span>,
<a name="l00893"></a>00893 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT &#39;avg&#39; */</span>,
<a name="l00894"></a>00894 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>,
<a name="l00895"></a>00895 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span>
<a name="l00896"></a>00896 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00897"></a>00897 VOLATILE
<a name="l00898"></a>00898 STRICT
<a name="l00899"></a>00899 LANGUAGE plpgsql
<a name="l00900"></a>00900 AS $$
<a name="l00901"></a>00901 DECLARE
<a name="l00902"></a>00902 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00903"></a>00903 BEGIN
<a name="l00904"></a>00904 ret = MADLIB_SCHEMA.kmeans(
<a name="l00905"></a>00905 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
<a name="l00906"></a>00906 $4, $5, $6, $7);
<a name="l00907"></a>00907 RETURN ret;
<a name="l00908"></a>00908 END
<a name="l00909"></a>00909 $$;
<a name="l00910"></a><a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed">00910</a>
<a name="l00911"></a>00911 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>(
<a name="l00912"></a>00912 rel_source VARCHAR,
<a name="l00913"></a>00913 expr_point VARCHAR,
<a name="l00914"></a>00914 k INTEGER,
<a name="l00915"></a>00915 fn_dist VARCHAR,
<a name="l00916"></a>00916 agg_centroid VARCHAR,
<a name="l00917"></a>00917 max_num_iterations INTEGER
<a name="l00918"></a>00918 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00919"></a>00919 VOLATILE
<a name="l00920"></a>00920 STRICT
<a name="l00921"></a>00921 LANGUAGE plpgsql
<a name="l00922"></a>00922 AS $$
<a name="l00923"></a>00923 DECLARE
<a name="l00924"></a>00924 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00925"></a>00925 BEGIN
<a name="l00926"></a>00926 ret = MADLIB_SCHEMA.kmeans(
<a name="l00927"></a>00927 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
<a name="l00928"></a>00928 $4, $5, $6, 0.001);
<a name="l00929"></a>00929 RETURN ret;
<a name="l00930"></a>00930 END
<a name="l00931"></a>00931 $$;
<a name="l00932"></a>00932
<a name="l00933"></a>00933 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>(
<a name="l00934"></a>00934 rel_source VARCHAR,
<a name="l00935"></a>00935 expr_point VARCHAR,
<a name="l00936"></a>00936 k INTEGER,
<a name="l00937"></a>00937 fn_dist VARCHAR,
<a name="l00938"></a>00938 agg_centroid VARCHAR
<a name="l00939"></a>00939 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00940"></a>00940 VOLATILE
<a name="l00941"></a>00941 STRICT
<a name="l00942"></a>00942 LANGUAGE plpgsql
<a name="l00943"></a>00943 AS $$
<a name="l00944"></a>00944 DECLARE
<a name="l00945"></a>00945 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00946"></a>00946 BEGIN
<a name="l00947"></a>00947 ret = MADLIB_SCHEMA.kmeans(
<a name="l00948"></a>00948 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
<a name="l00949"></a>00949 $4, $5, 20, 0.001);
<a name="l00950"></a>00950 RETURN ret;
<a name="l00951"></a>00951 END
<a name="l00952"></a>00952 $$;
<a name="l00953"></a>00953
<a name="l00954"></a>00954 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>(
<a name="l00955"></a>00955 rel_source VARCHAR,
<a name="l00956"></a>00956 expr_point VARCHAR,
<a name="l00957"></a>00957 k INTEGER,
<a name="l00958"></a>00958 fn_dist VARCHAR
<a name="l00959"></a>00959 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00960"></a>00960 VOLATILE
<a name="l00961"></a>00961 STRICT
<a name="l00962"></a>00962 LANGUAGE plpgsql
<a name="l00963"></a>00963 AS $$
<a name="l00964"></a>00964 DECLARE
<a name="l00965"></a>00965 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00966"></a>00966 BEGIN
<a name="l00967"></a>00967 ret = MADLIB_SCHEMA.kmeans(
<a name="l00968"></a>00968 $1, $2,
<a name="l00969"></a>00969 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
<a name="l00970"></a>00970 $4, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001);
<a name="l00971"></a>00971 RETURN ret;
<a name="l00972"></a>00972 END
<a name="l00973"></a>00973 $$;
<a name="l00974"></a>00974
<a name="l00975"></a>00975 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#aeec5efd06aca50f4830aa10d522dc5ed" title="Run k-Means with random seeding.">kmeans_random</a>(
<a name="l00976"></a>00976 rel_source VARCHAR,
<a name="l00977"></a>00977 expr_point VARCHAR,
<a name="l00978"></a>00978 k INTEGER
<a name="l00979"></a>00979 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l00980"></a>00980 VOLATILE
<a name="l00981"></a>00981 STRICT
<a name="l00982"></a>00982 LANGUAGE plpgsql
<a name="l00983"></a>00983 AS $$
<a name="l00984"></a>00984 DECLARE
<a name="l00985"></a>00985 ret MADLIB_SCHEMA.kmeans_result;
<a name="l00986"></a>00986 BEGIN
<a name="l00987"></a>00987 ret = MADLIB_SCHEMA.kmeans(
<a name="l00988"></a>00988 $1, $2,
<a name="l00989"></a>00989 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
<a name="l00990"></a>00990 &#39;MADLIB_SCHEMA.squared_dist_norm2&#39;, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001);
<a name="l00991"></a>00991 RETURN ret;
<a name="l00992"></a>00992 END
<a name="l00993"></a>00993 $$;
<a name="l00994"></a>00994 <span class="comment"></span>
<a name="l00995"></a>00995 <span class="comment">/**</span>
<a name="l00996"></a>00996 <span class="comment"> * @internal</span>
<a name="l00997"></a>00997 <span class="comment"> * @brief Execute a SQL command where $1, ..., $6 are substituted with the</span>
<a name="l00998"></a>00998 <span class="comment"> * given arguments.</span>
<a name="l00999"></a>00999 <span class="comment"> */</span>
<a name="l01000"></a>01000 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
<a name="l01001"></a>01001 sql VARCHAR, rel_source VARCHAR, expr_point VARCHAR,
<a name="l01002"></a>01002 fn_dist VARCHAR, agg_centroid VARCHAR, max_num_iterations INTEGER,
<a name="l01003"></a>01003 min_frac_reassigned DOUBLE PRECISION
<a name="l01004"></a>01004 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01005"></a>01005 VOLATILE
<a name="l01006"></a>01006 CALLED ON NULL INPUT
<a name="l01007"></a>01007 LANGUAGE c
<a name="l01008"></a>01008 AS &#39;MODULE_PATHNAME&#39;, &#39;exec_sql_using&#39;;
<a name="l01009"></a>01009 <span class="comment"></span>
<a name="l01010"></a>01010 <span class="comment">/**</span>
<a name="l01011"></a>01011 <span class="comment"> * @internal</span>
<a name="l01012"></a>01012 <span class="comment"> * @brief Filter out the invalid data points in the original input relation</span>
<a name="l01013"></a>01013 <span class="comment"> */</span>
<a name="l01014"></a>01014 CREATE FUNCTION MADLIB_SCHEMA.__filter_input_relation(
<a name="l01015"></a>01015 rel_source VARCHAR, expr_point VARCHAR)
<a name="l01016"></a>01016 RETURNS VARCHAR
<a name="l01017"></a>01017 AS $$
<a name="l01018"></a>01018 DECLARE
<a name="l01019"></a>01019 oldClientMinMessages VARCHAR;
<a name="l01020"></a>01020 rel_source_filtered VARCHAR;
<a name="l01021"></a>01021 BEGIN
<a name="l01022"></a>01022 IF (SELECT position(&#39;.&#39; in rel_source)) &gt; 0 THEN
<a name="l01023"></a>01023 rel_source_filtered := &#39;_madlib_&#39; || split_part(rel_source, &#39;.&#39;, 2) || &#39;_filtered&#39;;
<a name="l01024"></a>01024 ELSE
<a name="l01025"></a>01025 rel_source_filtered := &#39;_madlib_&#39; || rel_source || &#39;_filtered&#39;;
<a name="l01026"></a>01026 END IF;
<a name="l01027"></a>01027
<a name="l01028"></a>01028 oldClientMinMessages :=
<a name="l01029"></a>01029 (SELECT setting FROM pg_settings WHERE name = &#39;client_min_messages&#39;);
<a name="l01030"></a>01030 EXECUTE &#39;SET client_min_messages TO warning&#39;;
<a name="l01031"></a>01031 EXECUTE &#39;DROP VIEW IF EXISTS _madlib_&#39;||rel_source_filtered||&#39;_filtered&#39;;
<a name="l01032"></a>01032 EXECUTE &#39;DROP VIEW IF EXISTS &#39;||rel_source_filtered;
<a name="l01033"></a>01033 EXECUTE &#39;CREATE TEMP VIEW &#39;||rel_source_filtered||&#39;
<a name="l01034"></a>01034 AS SELECT * FROM &#39;||rel_source||&#39;
<a name="l01035"></a>01035 WHERE abs(
<a name="l01036"></a>01036 coalesce(
<a name="l01037"></a>01037 MADLIB_SCHEMA.<a class="code" href="svec_8sql__in.html#a00a7b3260b9fde9b55061e6bf58a028a">svec_elsum</a>(&#39;||expr_point||&#39;),
<a name="l01038"></a>01038 &#39;&#39;Infinity&#39;&#39;::FLOAT8
<a name="l01039"></a>01039 )
<a name="l01040"></a>01040 ) &lt; &#39;&#39;Infinity&#39;&#39;::FLOAT8&#39;;
<a name="l01041"></a>01041 EXECUTE &#39;SET client_min_messages TO &#39; || oldClientMinMessages;
<a name="l01042"></a>01042 RETURN rel_source_filtered;
<a name="l01043"></a>01043 EXCEPTION
<a name="l01044"></a>01044 WHEN undefined_function THEN
<a name="l01045"></a>01045 RAISE EXCEPTION &#39;Point coordinates (%) are not a valid type
<a name="l01046"></a>01046 (SVEC, FLOAT[], or INTEGER[]).&#39;, expr_point;
<a name="l01047"></a>01047 END
<a name="l01048"></a>01048 $$
<a name="l01049"></a>01049 VOLATILE STRICT
<a name="l01050"></a>01050 LANGUAGE PLPGSQL;
<a name="l01051"></a>01051 <span class="comment"></span>
<a name="l01052"></a>01052 <span class="comment">/**</span>
<a name="l01053"></a>01053 <span class="comment"> * @brief Perform Lloyd&#39;s k-means local-search heuristic, but with initial</span>
<a name="l01054"></a>01054 <span class="comment"> * centroids stored in a table</span>
<a name="l01055"></a>01055 <span class="comment"> *</span>
<a name="l01056"></a>01056 <span class="comment"> * This is a shortcut for running k-means with initial centroids stored in a</span>
<a name="l01057"></a>01057 <span class="comment"> * table (as opposed to an array of centroids). It is equivalent</span>
<a name="l01058"></a>01058 <span class="comment"> * to</span>
<a name="l01059"></a>01059 <span class="comment"> * &lt;pre&gt;SELECT \ref kmeans(</span>
<a name="l01060"></a>01060 <span class="comment"> rel_source,</span>
<a name="l01061"></a>01061 <span class="comment"> expr_point,</span>
<a name="l01062"></a>01062 <span class="comment"> (SELECT \ref matrix_agg($expr_centroid) FROM $rel_initial_centroids),</span>
<a name="l01063"></a>01063 <span class="comment"> fn_dist,</span>
<a name="l01064"></a>01064 <span class="comment"> agg_centroid,</span>
<a name="l01065"></a>01065 <span class="comment"> max_num_iterations,</span>
<a name="l01066"></a>01066 <span class="comment"> min_frac_reassigned</span>
<a name="l01067"></a>01067 <span class="comment">)&lt;/pre&gt;</span>
<a name="l01068"></a>01068 <span class="comment"> * where &lt;tt&gt;$expr_centroid&lt;/tt&gt; and &lt;tt&gt;$rel_initial_centroids&lt;/tt&gt; denote</span>
<a name="l01069"></a>01069 <span class="comment"> * textual substituions.</span>
<a name="l01070"></a>01070 <span class="comment"> */</span>
<a name="l01071"></a>01071 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l01072"></a>01072 rel_source VARCHAR,
<a name="l01073"></a>01073 expr_point VARCHAR,
<a name="l01074"></a>01074 rel_initial_centroids VARCHAR,
<a name="l01075"></a>01075 expr_centroid VARCHAR,
<a name="l01076"></a>01076 fn_dist VARCHAR <span class="comment">/*+ DEFAULT &#39;squared_dist_norm2&#39; */</span>,
<a name="l01077"></a>01077 agg_centroid VARCHAR <span class="comment">/*+ DEFAULT &#39;avg&#39; */</span>,
<a name="l01078"></a>01078 max_num_iterations INTEGER <span class="comment">/*+ DEFAULT 20 */</span>,
<a name="l01079"></a>01079 min_frac_reassigned DOUBLE PRECISION <span class="comment">/*+ DEFAULT 0.001 */</span>
<a name="l01080"></a>01080 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01081"></a>01081 VOLATILE
<a name="l01082"></a>01082 STRICT
<a name="l01083"></a>01083 LANGUAGE plpgsql
<a name="l01084"></a>01084 AS $$
<a name="l01085"></a>01085 DECLARE
<a name="l01086"></a>01086 class_rel_initial_centroids REGCLASS;
<a name="l01087"></a>01087 theResult MADLIB_SCHEMA.kmeans_result;
<a name="l01088"></a>01088 BEGIN
<a name="l01089"></a>01089 class_rel_initial_centroids := rel_initial_centroids;
<a name="l01090"></a>01090 SELECT * FROM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
<a name="l01091"></a>01091 SELECT MADLIB_SCHEMA.kmeans(
<a name="l01092"></a>01092 $1, $2,
<a name="l01093"></a><a class="code" href="kmeans_8sql__in.html#a6e1a47f006bc0576f56eabcd6903086f">01093</a> (
<a name="l01094"></a>01094 SELECT MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#a9c439706f35d6cac89f151d553a5f111" title="Combine vectors to a matrix.">matrix_agg</a>(($sql$ || expr_centroid || $sql$)::FLOAT8[])
<a name="l01095"></a>01095 FROM $sql$ || textin(regclassout(class_rel_initial_centroids))
<a name="l01096"></a>01096 || $sql$
<a name="l01097"></a>01097 ),
<a name="l01098"></a>01098 $3, $4, $5, $6)
<a name="l01099"></a>01099 $sql$,
<a name="l01100"></a>01100 rel_source, expr_point,
<a name="l01101"></a>01101 fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned)
<a name="l01102"></a>01102 INTO theResult;
<a name="l01103"></a>01103 RETURN theResult;
<a name="l01104"></a>01104 END;
<a name="l01105"></a>01105 $$;
<a name="l01106"></a>01106
<a name="l01107"></a>01107 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l01108"></a>01108 rel_source VARCHAR,
<a name="l01109"></a>01109 expr_point VARCHAR,
<a name="l01110"></a>01110 rel_initial_centroids VARCHAR,
<a name="l01111"></a>01111 expr_centroid VARCHAR,
<a name="l01112"></a>01112 fn_dist VARCHAR,
<a name="l01113"></a>01113 agg_centroid VARCHAR,
<a name="l01114"></a>01114 max_num_iterations INTEGER
<a name="l01115"></a>01115 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01116"></a>01116 VOLATILE
<a name="l01117"></a>01117 STRICT
<a name="l01118"></a>01118 LANGUAGE sql AS $$
<a name="l01119"></a>01119 SELECT MADLIB_SCHEMA.kmeans(
<a name="l01120"></a>01120 $1, $2,
<a name="l01121"></a>01121 $3, $4, $5, $6, $7, 0.001)
<a name="l01122"></a>01122 $$;
<a name="l01123"></a>01123
<a name="l01124"></a>01124 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l01125"></a>01125 rel_source VARCHAR,
<a name="l01126"></a>01126 expr_point VARCHAR,
<a name="l01127"></a>01127 rel_initial_centroids VARCHAR,
<a name="l01128"></a>01128 expr_centroid VARCHAR,
<a name="l01129"></a>01129 fn_dist VARCHAR,
<a name="l01130"></a>01130 agg_centroid VARCHAR
<a name="l01131"></a>01131 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01132"></a>01132 VOLATILE
<a name="l01133"></a>01133 STRICT
<a name="l01134"></a>01134 LANGUAGE sql AS $$
<a name="l01135"></a>01135 SELECT MADLIB_SCHEMA.kmeans(
<a name="l01136"></a>01136 $1, $2,
<a name="l01137"></a>01137 $3, $4, $5, $6, 20, 0.001)
<a name="l01138"></a>01138 $$;
<a name="l01139"></a>01139
<a name="l01140"></a>01140 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l01141"></a>01141 rel_source VARCHAR,
<a name="l01142"></a>01142 expr_point VARCHAR,
<a name="l01143"></a>01143 rel_initial_centroids VARCHAR,
<a name="l01144"></a>01144 expr_centroid VARCHAR,
<a name="l01145"></a>01145 fn_dist VARCHAR
<a name="l01146"></a>01146 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01147"></a>01147 VOLATILE
<a name="l01148"></a>01148 STRICT
<a name="l01149"></a>01149 LANGUAGE sql AS $$
<a name="l01150"></a>01150 SELECT MADLIB_SCHEMA.kmeans(
<a name="l01151"></a>01151 $1, $2,
<a name="l01152"></a>01152 $3, $4, $5, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001)
<a name="l01153"></a>01153 $$;
<a name="l01154"></a>01154
<a name="l01155"></a>01155 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
<a name="l01156"></a>01156 rel_source VARCHAR,
<a name="l01157"></a>01157 expr_point VARCHAR,
<a name="l01158"></a>01158 rel_initial_centroids VARCHAR,
<a name="l01159"></a>01159 expr_centroid VARCHAR
<a name="l01160"></a>01160 ) RETURNS MADLIB_SCHEMA.kmeans_result
<a name="l01161"></a>01161 VOLATILE
<a name="l01162"></a>01162 STRICT
<a name="l01163"></a>01163 LANGUAGE sql AS $$
<a name="l01164"></a>01164 SELECT MADLIB_SCHEMA.kmeans(
<a name="l01165"></a>01165 $1, $2,
<a name="l01166"></a>01166 $3, $4,
<a name="l01167"></a>01167 &#39;MADLIB_SCHEMA.squared_dist_norm2&#39;, &#39;MADLIB_SCHEMA.avg&#39;, 20, 0.001)
<a name="l01168"></a>01168 $$;
<a name="l01169"></a>01169
<a name="l01170"></a>01170 <span class="comment"></span>
<a name="l01171"></a>01171 <span class="comment">/**</span>
<a name="l01172"></a>01172 <span class="comment"> * @internal</span>
<a name="l01173"></a>01173 <span class="comment"> * @brief Execute a SQL command where $1, ..., $3 are substituted with the</span>
<a name="l01174"></a>01174 <span class="comment"> * given arguments.</span>
<a name="l01175"></a>01175 <span class="comment"> */</span>
<a name="l01176"></a>01176 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_silhouette_args(
<a name="l01177"></a>01177 sql VARCHAR, centroids DOUBLE PRECISION[][], fn_dist REGPROC
<a name="l01178"></a>01178 ) RETURNS DOUBLE PRECISION
<a name="l01179"></a>01179 STABLE
<a name="l01180"></a>01180 CALLED ON NULL INPUT
<a name="l01181"></a>01181 LANGUAGE c
<a name="l01182"></a>01182 AS &#39;MODULE_PATHNAME&#39;, &#39;exec_sql_using&#39;;
<a name="l01183"></a>01183 <span class="comment"></span>
<a name="l01184"></a>01184 <span class="comment">/**</span>
<a name="l01185"></a>01185 <span class="comment"> * @brief Compute a simplified version of the silhouette coefficient</span>
<a name="l01186"></a>01186 <span class="comment"> *</span>
<a name="l01187"></a>01187 <span class="comment"> * @param rel_source Name of the relation containing input points</span>
<a name="l01188"></a>01188 <span class="comment"> * @param expr_point Expression evaluating to point coordinates \f$ x_i \f$ for</span>
<a name="l01189"></a>01189 <span class="comment"> * each tuple</span>
<a name="l01190"></a>01190 <span class="comment"> * @param centroids Matrix \f$ M = (\vec{m_0} \dots \vec{m_{k-1}})</span>
<a name="l01191"></a>01191 <span class="comment"> * \in \mathbb{R}^{d \times k} \f$ with \f$ k \f$ columns, where column</span>
<a name="l01192"></a>01192 <span class="comment"> * \f$ i \f$ contains the position of centroid \f$ i \f$.</span>
<a name="l01193"></a>01193 <span class="comment"> * @param fn_dist Name of a function with signature</span>
<a name="l01194"></a>01194 <span class="comment"> * &lt;tt&gt;DOUBLE PRECISION[] x DOUBLE PRECISION[] -&gt; DOUBLE PRECISION&lt;/tt&gt; that</span>
<a name="l01195"></a>01195 <span class="comment"> * returns the distance between two points</span>
<a name="l01196"></a>01196 <span class="comment"> * @return For each point \f$ x_i \f$, let</span>
<a name="l01197"></a>01197 <span class="comment"> * \f$ d_1( x_i ) \f$ and \f$ d_2( x_i ) \f$ be the distance to the closest</span>
<a name="l01198"></a>01198 <span class="comment"> * and 2nd-closest centroid, respectively. If there is more than one</span>
<a name="l01199"></a>01199 <span class="comment"> * closest centroids then \f$ d_1( x_i ) = d_2( x_i )\f$.</span>
<a name="l01200"></a>01200 <span class="comment"> * The return value is the average, over all points \f$ x_i \f$, of</span>
<a name="l01201"></a>01201 <span class="comment"> * \f[</span>
<a name="l01202"></a>01202 <span class="comment"> * \frac{d_2( x_i ) - d_1(x_i)}{d_2(x_i)},</span>
<a name="l01203"></a>01203 <span class="comment"> * \f]</span>
<a name="l01204"></a>01204 <span class="comment"> * where 0/0 is interpreted as 0.</span>
<a name="l01205"></a>01205 <span class="comment"> * Clearly, the simplified silhouette coefficient assumes values in</span>
<a name="l01206"></a>01206 <span class="comment"> * \f$ [0,1] \f$.</span>
<a name="l01207"></a>01207 <span class="comment"> */</span>
<a name="l01208"></a>01208 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>(
<a name="l01209"></a>01209 rel_source VARCHAR,
<a name="l01210"></a>01210 expr_point VARCHAR,
<a name="l01211"></a>01211 centroids DOUBLE PRECISION[][],
<a name="l01212"></a>01212 fn_dist VARCHAR <span class="comment">/*+ DEFAULT &#39;dist_norm2&#39; */</span>
<a name="l01213"></a>01213 ) RETURNS DOUBLE PRECISION
<a name="l01214"></a>01214 STABLE
<a name="l01215"></a>01215 STRICT
<a name="l01216"></a>01216 LANGUAGE plpgsql
<a name="l01217"></a>01217 AS $$
<a name="l01218"></a>01218 DECLARE
<a name="l01219"></a>01219 class_rel_source REGCLASS;
<a name="l01220"></a>01220 proc_fn_dist REGPROCEDURE;
<a name="l01221"></a>01221 rel_filtered VARCHAR;
<a name="l01222"></a>01222 BEGIN
<a name="l01223"></a>01223 IF (array_upper(centroids,1) IS NULL) THEN
<a name="l01224"></a>01224 RAISE EXCEPTION &#39;No valid centroids given.&#39;;
<a name="l01225"></a>01225 END IF;
<a name="l01226"></a>01226
<a name="l01227"></a>01227 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
<a name="l01228"></a>01228 class_rel_source := rel_filtered;
<a name="l01229"></a>01229 proc_fn_dist := fn_dist
<a name="l01230"></a><a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f">01230</a> || &#39;(DOUBLE PRECISION[], DOUBLE PRECISION[])&#39;;
<a name="l01231"></a>01231 IF (SELECT prorettype != &#39;DOUBLE PRECISION&#39;::regtype OR proisagg = TRUE
<a name="l01232"></a>01232 FROM pg_proc WHERE oid = proc_fn_dist) THEN
<a name="l01233"></a>01233 RAISE EXCEPTION &#39;Distance function has wrong signature or is not a simple function.&#39;;
<a name="l01234"></a>01234 END IF;
<a name="l01235"></a>01235
<a name="l01236"></a>01236 RETURN MADLIB_SCHEMA.internal_execute_using_silhouette_args($sql$
<a name="l01237"></a>01237 SELECT
<a name="l01238"></a>01238 avg(CASE
<a name="l01239"></a>01239 WHEN distances[2] = 0 THEN 0
<a name="l01240"></a>01240 ELSE (distances[2] - distances[1]) / distances[2]
<a name="l01241"></a>01241 END)
<a name="l01242"></a>01242 FROM (
<a name="l01243"></a>01243 SELECT
<a name="l01244"></a>01244 (MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#ad3e16fc5435474b96b182ba20905461e" title="Given matrix and vector compute the columns of that are closest to .">closest_columns</a>(
<a name="l01245"></a>01245 $1,
<a name="l01246"></a>01246 ($sql$ || expr_point || $sql$)::FLOAT8[],
<a name="l01247"></a>01247 2::INTEGER,
<a name="l01248"></a>01248 $2
<a name="l01249"></a>01249 )).distances
<a name="l01250"></a>01250 FROM
<a name="l01251"></a>01251 $sql$ || textin(regclassout(class_rel_source)) || $sql$
<a name="l01252"></a>01252 ) AS two_shortest_distances
<a name="l01253"></a>01253 $sql$,
<a name="l01254"></a>01254 centroids, proc_fn_dist);
<a name="l01255"></a>01255 END;
<a name="l01256"></a>01256 $$;
<a name="l01257"></a>01257
<a name="l01258"></a>01258 CREATE FUNCTION MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>(
<a name="l01259"></a>01259 rel_source VARCHAR,
<a name="l01260"></a>01260 expr_point VARCHAR,
<a name="l01261"></a>01261 centroids DOUBLE PRECISION[][]
<a name="l01262"></a>01262 ) RETURNS DOUBLE PRECISION
<a name="l01263"></a>01263 STABLE
<a name="l01264"></a>01264 STRICT
<a name="l01265"></a>01265 LANGUAGE sql
<a name="l01266"></a>01266 AS $$
<a name="l01267"></a>01267 SELECT MADLIB_SCHEMA.<a class="code" href="kmeans_8sql__in.html#a71e7675758c99acbe7785819b6a85a8f" title="Compute a simplified version of the silhouette coefficient.">simple_silhouette</a>($1, $2, $3,
<a name="l01268"></a>01268 &#39;MADLIB_SCHEMA.<a class="code" href="linalg_8sql__in.html#aa58e51526edea6ea98db30b6f250adb4" title="2-norm of the difference between two vectors">dist_norm2</a>&#39;)
<a name="l01269"></a>01269 $$;
</pre></div></div>
</div>
<div id="nav-path" class="navpath">
<ul>
<li class="navelem"><a class="el" href="kmeans_8sql__in.html">kmeans.sql_in</a> </li>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<li class="footer">Generated on Fri May 10 2013 01:37:13 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.5.1 </li>
</ul>
</div>
</body>
</html>