blob: 30031488627c29961842a19c0ab46348faed8241 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file kmeans.sql_in
*
* @brief Set of functions for k-means clustering.
*
* @sa For a brief introduction to k-means clustering, see the module
* description \ref grp_kmeans.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_kmeans
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#cluster">Clustering Function</a></li>
<li class="level1"><a href="#auto_cluster">Auto Clustering Function</a></li>
<li class="level1"><a href="#assignment">Cluster Assignment</a></li>
<li class="level1"><a href="#silh">Simple Silhouette</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Partitions a set of observations into clusters by finding centroids that
minimize the sum of observations' distances from their closest centroid.
Clustering refers to the problem of partitioning a set of objects according to
some problem-dependent measure of <em>similarity</em>. In the k-means variant,
given \f$ n \f$ points \f$ x_1, \dots, x_n \in \mathbb R^d \f$, the
goal is to position \f$ k \f$ centroids \f$ c_1, \dots, c_k \in \mathbb R^d \f$
so that the sum of \em distances between each point and its closest centroid is
minimized. Each centroid represents a cluster that consists of all points to
which this centroid is closest.
This module can compute clusters given the number of centroids <em>k</em> as an input,
using a variety of seeding methods.
It can also automatically select the best <em>k</em> value from a range
of suggested <em>k</em> values, using the simplified silhouette method
or the elbow method.
@anchor cluster
@par Clustering Function
The k-means algorithm can be invoked in different ways, depending on the
source of the initial set of centroids:
- Uniform random centroid seeding method.
<pre class="syntax">
kmeans_random( rel_source,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
</pre>
- k-means++ centroid seeding method <a href="#kmeans-lit-1">[1]</a>.
This method can speed up convergence
by seeding centroids spread out over the whole range of the input
points, while at the same time not being too susceptible to outliers.
<pre class="syntax">
kmeanspp( rel_source,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned,
seeding_sample_ratio
)
</pre>
- Supply an initial centroid set in a relation identified
by the \e rel_initial_centroids argument, for the case
where initial centroids are stored in a table.
<pre class="syntax">
kmeans( rel_source,
expr_point,
rel_initial_centroids,
expr_centroid,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
</pre>
- Provide an initial centroid set as an array expression
in the \e initial_centroids argument.
<pre class="syntax">
kmeans( rel_source,
expr_point,
initial_centroids,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
</pre>
\b Arguments
<dl class="arglist">
<dt>rel_source</dt>
<dd>TEXT. The name of the table containing the input data points. Data points and
predefined centroids (if used) are expected to be stored row-wise,
in a column of type <tt>\ref grp_svec "SVEC"</tt> (or any type convertible to
<tt>\ref grp_svec "SVEC"</tt>, like <tt>FLOAT[]</tt> or <tt>INTEGER[]</tt>).
Data points with non-finite values (NULL, NaN, infinity) in any component
are skipped during analysis.
</dd>
<dt>expr_point</dt>
<dd>TEXT. The name of the column with point coordinates or an array expression.</dd>
<dt>k</dt>
<dd>INTEGER. The number of centroids to calculate.</dd>
<dt>fn_dist (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function
to use to calculate the distance from a data point vector to a centroid vector.
The following distance functions can be used (computation of barycenter/mean in parentheses):
<ul>
<li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median).
MADlib does not provide a median aggregate function for
performance reasons.</li>
<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points <a href="#kmeans-lit-5">[5]</a>)</li>
<li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION</tt></li></ul></dd>
<dt>agg_centroid (optional)</dt>
<dd>TEXT, default: 'avg'. The name of the aggregate function used to determine centroids.
The following aggregate functions can be used:<ul>
<li><b>\ref avg</b>: average (default)</li>
<li><b>\ref normalized_avg</b>: normalized average</li></ul></dd>
<dt>max_num_iterations (optional)</dt>
<dd>INTEGER, default: 20. The maximum number of iterations to perform.</dd>
<dt>min_frac_reassigned (optional)</dt>
<dd>DOUBLE PRECISION, default: 0.001. The minimum fraction of centroids reassigned to continue iterating.
When fewer than this fraction of centroids are reassigned in an iteration, the calculation completes.
<dt>seeding_sample_ratio (optional)</dt>
<dd>DOUBLE PRECISION, default: 1.0. The proportion of subsample of original dataset
to use for kmeans++ centroid seeding method. Kmeans++ scans through the data
sequentially 'k' times and can be too slow for big datasets. When
'seeding_sample_ratio' is greater than 0 (thresholded to be maximum value of 1.0),
the seeding is run on a uniform random subsample of the data.
Note: the final K-means algorithm is run on the complete dataset. This parameter
only builds a subsample for the seeding and is only available for kmeans++.
<dt>rel_initial_centroids</dt>
<dd>TEXT. Table or view containing the set of initial centroids.
</dd>
<dt>expr_centroid</dt>
<dd>TEXT. The name of the column (or the array expression)
in the <em>rel_initial_centroids</em> table or view that contains
the centroid coordinates.</dd>
<dt>initial_centroids</dt>
<dd>DOUBLE PRECISION[][]. Array expression
with the initial centroid coordinates.</dd>
</dl>
<b>Output</b>
<br>
The output of the k-means module is a composite type with
the following columns:
<table class="output">
<tr>
<th>centroids</th>
<td>DOUBLE PRECISION[][]. The final centroid positions.</td>
</tr>
<tr>
<th>cluster_variance</th>
<td>DOUBLE PRECISION[]. The value of the objective function per cluster.</td>
</tr>
<tr>
<th>objective_fn</th>
<td>DOUBLE PRECISION. The value of the objective function.</td>
</tr>
<tr>
<th>frac_reassigned</th>
<td>DOUBLE PRECISION. The fraction of points reassigned in the last iteration.</td>
</tr>
<tr>
<th>num_iterations</th>
<td>INTEGER. The total number of iterations executed.</td>
</tr>
</table>
@anchor auto_cluster
@par Auto Clustering Function
The optimal number of centroids can be determined automatically,
from a set of candidate values that you provide, for
random seeding or <em>k-means++</em> seeding. The <a href="#simplified_silhouette">simplified
silhouette method</a> or the <a href="#elbow">elbow method</a> are used to pick the best <em>k</em> value.
- Uniform random centroid seeding method.
<pre class="syntax">
kmeans_random_auto( rel_source,
output_table,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned,
k_selection_algorithm
)
</pre>
- k-means++ centroid seeding method.
<pre class="syntax">
kmeanspp_auto( rel_source,
output_table,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned,
seeding_sample_ratio,
k_selection_algorithm
)
</pre>
<b>Arguments</b>
<br>
The arguments for auto k selection are the same as described above,
with the following additions:
<dl class="arglist">
<dt>output_table</dt>
<dd>TEXT. Name of the output table containing results for each k
value. Details of the output table are shown below.
A summary table called <output_table>_summary will also be
created for the best k value as per the selection algorithm.</dd>
<dt>k</dt>
<dd>INTEGER[]. Array of k values to test. Does not need to be
contiguous but all elements must be >1 and cannot repeat within the array.
Parameter 'k_selection_algorithm'
determines the evaluation method.</dd>
<dt>k_selection_algorithm (optional)</dt>
<dd>TEXT, default: 'silhouette'. Method to evaluate optimal number of centroids k.
Current approaches supported: 'silhouette', 'elbow' or 'both'.
The text can be any subset of the strings; for e.g., 'silh' will
use the silhouette method.
Note that for large data sets, the silhouette computation
can be expensive.</dd>
@note
Note that the auto k-means algorithms require the NumPy python package to be
installed on the system since the elbow method uses the NumPy gradient function <a href="#kmeans-lit-2">[2]</a>.
For Greenplum clusters, installing NumPy only on the
host machine of the master segment will be sufficient. The suggested
installation method is: <em>pip install numpy --user</em>
</dl>
<b>Output Tables</b>
<br>
Two output tables are created for auto k-means.
The first is called 'output_table' and contains one row
per k value:
<table class="output">
<tr>
<th>k</th>
<td>INTEGER. Number of centroids.</td>
</tr>
<tr>
<th>centroids</th>
<td>DOUBLE PRECISION[][]. The final centroid positions.</td>
</tr>
<tr>
<th>cluster_variance</th>
<td>DOUBLE PRECISION[]. The value of the objective function per cluster.</td>
</tr>
<tr>
<th>objective_fn</th>
<td>DOUBLE PRECISION. The value of the objective function.</td>
</tr>
<tr>
<th>frac_reassigned</th>
<td>DOUBLE PRECISION. The fraction of points reassigned in the last iteration.</td>
</tr>
<tr>
<th>num_iterations</th>
<td>INTEGER. The total number of iterations executed.</td>
</tr>
<tr>
<th>k</th>
<td>INTEGER. Number of centroids as per the
specified 'k_selection_algorithm'. If 'both' is specified,
the best k value will correspond to the silhouette method.</td>
</tr>
<tr>
<th>silhouette</th>
<td>DOUBLE PRECISION. The value of the simplified silhouette
score for the k value, if computed.</td>
</tr>
<tr>
<th>elbow</th>
<td>DOUBLE PRECISION. The value of the elbow
score for the k value, if computed.</td>
</tr>
<tr>
<th>selection_algorithm</th>
<td>TEXT. Algorithm used to select the best
k (either 'silhouette' or 'elbow')</td>
</tr>
</table>
</dl>
A summary table named <output_table>_summary is also created, which has the same output as the
'output_table' above but only contains one row for best k value as per the selection algorithm.
If 'both' is specified for 'k_selection_algorithm'
the best k value returned will correspond to the silhouette method.
@anchor assignment
@par Cluster Assignment
After clustering, the cluster assignment for each data point can be computed with
the help of the closest_column() utility function:
<pre class="syntax">
closest_column( m,
x,
dist
)
</pre>
<b>Arguments</b>
<dl class="arglist">
<dt>m</dt>
<dd>DOUBLE PRECISION[][]. Learned centroids from the training function.</dd>
<dt>x</dt>
<dd>DOUBLE PRECISION[]. Data points.</dd>
<dt>dist (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function
to use to calculate the distance from a data point vector to a centroid vector.
See the \e fn_dist argument of the k-means training function for more details on distance functions.</dd>
</dl>
<b>Output</b>
<br>
The output is a composite type with
the following columns:
<table class="output">
<tr>
<th>column_id</th>
<td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
</tr>
<tr>
<th>distance</th>
<td>DOUBLE PRECISION. Distance to the cluster centroid.</td>
</tr>
</table>
@anchor silh
@par Simple Silhouette
A common method to assess the quality of the clustering is the <em>silhouette
coefficient</em>, a simplified version of which is
implemented in MADlib <a href="#kmeans-lit-3">[3]</a>.
There are two silhouette functions: average score across all data points,
and a per-point score. The average silhouette function has the following syntax:
<pre class="syntax">
simple_silhouette( rel_source,
expr_point,
centroids,
fn_dist
)
</pre>
The per-point silhouette function has two formats. The first
takes an array of centroids:
<pre class="syntax">
simple_silhouette_points(rel_source,
output_table,
pid,
expr_point,
centroids,
fn_dist
)
</pre>
The second takes a centroids column from a table:
<pre class="syntax">
simple_silhouette_points(rel_source,
output_table,
pid,
expr_point,
centroids_table,
centroids_col,
fn_dist
)
</pre>
\b Arguments
<dl class="arglist">
<dt>rel_source</dt>
<dd>TEXT. The name of the table containing the input data points.</dd>
<dt>expr_point</dt>
<dd>TEXT. The name of the column with point coordinates or an array expression.</dd>
<dt>centroids</dt>
<dd>DOUBLE PRECISION[][]. An expression evaluating to an array of centroids. </dd>
<dt>fn_dist (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function
to use to calculate the distance from a data point vector to a centroid vector.
See the \e fn_dist argument of the k-means training function for more details on distance functions.</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the output table to contain sihouette score
for each point.</dd>
<dt>pid</dt>
<dd>TEXT. Column containing point ID in the
table 'rel_source' containing the data points. </dd>
<dt>centroids_table</dt>
<dd>TEXT. Name of the table containing the centroids.</dd>
<dt>centroids_col</dt>
<dd>TEXT. Name of the column in the table 'centroids_table'
containing the centroids.</dd>
</dl>
<b>Output</b>
<br>
For the average function, a single value for simple silhouette score is returned.
For the per-point function, the output table contains
the following columns:
<table class="output">
<tr>
<th>pid</th>
<td>INTEGER. Point ID.</td>
</tr>
<tr>
<th>centroid_id</th>
<td>INTEGER. The cluster that the point is assigned to.</td>
</tr>
<tr>
<th>neighbor_centroid_id</th>
<td>INTEGER. The next closest cluster to the one that the point is assigned to.</td>
</tr>
<tr>
<th>simple_silhouette</th>
<td>DOUBLE PRECISION. Simplified silhouette score for the point.</td>
</tr>
</table>
@anchor examples
@examp
@note
Your results may not be exactly the same as below due to the nature random
nature of the k-means algorithm. Also, remember to be consistent in the distance
functions that you use in the k-means, silhouette and helper functions.
<h4>Clustering for Single <em>k</em> Value</h4>
-# Create input data:
<pre class="example">
DROP TABLE IF EXISTS km_sample;
CREATE TABLE km_sample(pid int, points double precision[]);
INSERT INTO km_sample VALUES
(1, '{14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}'),
(2, '{13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}'),
(3, '{13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}'),
(4, '{14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}'),
(5, '{13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}'),
(6, '{14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}'),
(7, '{14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}'),
(8, '{14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}'),
(9, '{14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}'),
(10, '{13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}');
</pre>
-# Run k-means clustering using kmeans++ for centroid seeding. Use squared Euclidean
distance which is a commonly used distance function.
<pre class="example">
DROP TABLE IF EXISTS km_result;
CREATE TABLE km_result AS
SELECT * FROM madlib.kmeanspp( 'km_sample', -- Table of source data
'points', -- Column containing point co-ordinates
2, -- Number of centroids to calculate
'madlib.squared_dist_norm2', -- Distance function
'madlib.avg', -- Aggregate function
20, -- Number of iterations
0.001 -- Fraction of centroids reassigned to keep iterating
);
\\x on;
SELECT * FROM km_result;
</pre>
<pre class="result">
-[ RECORD 1 ]----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
centroids | {{14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75},{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333}}
cluster_variance | {30561.74805,122999.110416013}
objective_fn | 153560.858466013
frac_reassigned | 0
num_iterations | 2
</pre>
-# Simplified silhouette coefficient. First find average
for whole data set. Make sure to use the same distance function as k-means above.
<pre class="example">
SELECT * FROM madlib.simple_silhouette( 'km_sample', -- Input points table
'points', -- Column containing points
(SELECT centroids FROM km_result), -- Centroids
'madlib.squared_dist_norm2' -- Distance function
);
</pre>
<pre class="result">
-[ RECORD 1 ]-----+------------------
simple_silhouette | 0.868174608939623
</pre>
Now calculate simplified silhouette coefficient for each point in the data set:
<pre class="example">
DROP TABLE IF EXISTS km_points_silh;
SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input points table
'km_points_silh', -- Output table
'pid', -- Point ID column in input table
'points', -- Points column in input table
'km_result', -- Centroids table
'centroids', -- Column in centroids table containing centroids
'madlib.squared_dist_norm2' -- Distance function
);
SELECT * FROM km_points_silh ORDER BY pid;
</pre>
<pre class="result">
pid | centroid_id | neighbor_centroid_id | silh
-----+-------------+----------------------+-------------------
1 | 1 | 0 | 0.966608819821713
2 | 1 | 0 | 0.926251125077039
3 | 1 | 0 | 0.28073008848306
4 | 0 | 1 | 0.951447083910869
5 | 1 | 0 | 0.80098239014753
6 | 0 | 1 | 0.972487557020722
7 | 0 | 1 | 0.88838568723116
8 | 0 | 1 | 0.906334719972002
9 | 1 | 0 | 0.994315140692314
10 | 1 | 0 | 0.99420347703982
(10 rows)
</pre>
-# Find the cluster assignment for each point.
Use the closest_column() function to map each point to the cluster that it belongs to.
<pre class="example">
\\x off;
DROP TABLE IF EXISTS point_cluster_map;
CREATE TABLE point_cluster_map AS
SELECT data.*, (madlib.closest_column(centroids, points, 'madlib.squared_dist_norm2')).*
FROM km_sample as data, km_result;
ALTER TABLE point_cluster_map RENAME column_id to cluster_id; -- change column name
SELECT * FROM point_cluster_map ORDER BY pid;
</pre>
<pre class="result">
pid | points | cluster_id | distance
-----+--------------------------------------------------------------------+------------+------------------
1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065} | 1 | 3296.12614333444
2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050} | 1 | 8856.90882600111
3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 1 | 27072.3216580044
4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 0 | 10261.9450375
5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 1 | 82492.8673553345
6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 0 | 5080.3612375
7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 0 | 8090.4485875
8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 0 | 7128.9931875
9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045} | 1 | 634.301947334443
10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045} | 1 | 646.584486004443
(10 rows)
</pre>
-# Display cluster ID. There are two steps to get the cluster id associated
with the centroid coordinates, if you need it. First unnest the cluster
centroids 2-D array to get a set of 1-D centroid arrays:
<pre class="example">
DROP TABLE IF EXISTS km_centroids_unnest;
-- Run unnest function
CREATE TABLE km_centroids_unnest AS
SELECT (madlib.array_unnest_2d_to_1d(centroids)).*
FROM km_result;
SELECT * FROM km_centroids_unnest ORDER BY 1;
</pre>
<pre class="result">
unnest_row_id | unnest_result
---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75}
2 | {13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333}
(2 rows)
</pre>
Note that the ID column returned by array_unnest_2d_to_1d() is just a row ID and not the cluster ID assigned by k-means. The second step to get the cluster_id is:
<pre class="example">
SELECT cent.*, (madlib.closest_column(centroids, unnest_result, 'madlib.squared_dist_norm2')).column_id as cluster_id
FROM km_centroids_unnest as cent, km_result
ORDER BY cent.unnest_row_id;
</pre>
<pre class="result">
unnest_row_id | unnest_result | cluster_id
---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+------------
1 | {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75} | 0
2 | {13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333} | 1
(2 rows)
</pre>
-# Run the same example as above, but using array input. Create the input table:
<pre class="example">
DROP TABLE IF EXISTS km_arrayin CASCADE;
CREATE TABLE km_arrayin(pid int,
p1 float,
p2 float,
p3 float,
p4 float,
p5 float,
p6 float,
p7 float,
p8 float,
p9 float,
p10 float,
p11 float,
p12 float,
p13 float);
INSERT INTO km_arrayin VALUES
(1, 14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065),
(2, 13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050),
(3, 13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185),
(4, 14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480),
(5, 13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735),
(6, 14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450),
(7, 14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290),
(8, 14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295),
(9, 14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045),
(10, 13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045);
</pre>
Now find the cluster assignment for each point using random seeding:
<pre class="example">
DROP TABLE IF EXISTS km_result_array;
-- Run kmeans algorithm
CREATE TABLE km_result_array AS
SELECT * FROM madlib.kmeans_random('km_arrayin', -- Table of source data
'ARRAY[p1, p2, p3, p4, p5, p6, -- Points
p7, p8, p9, p10, p11, p12, p13]',
2, -- Number of centroids to calculate
'madlib.squared_dist_norm2', -- Distance function
'madlib.avg', -- Aggregate function
20, -- Number of iterations
0.001); -- Fraction of centroids reassigned to keep iterating
-- Get point assignment
SELECT data.*, (madlib.closest_column(centroids,
ARRAY[p1, p2, p3, p4, p5, p6,
p7, p8, p9, p10, p11, p12, p13],
'madlib.squared_dist_norm2')).column_id as cluster_id
FROM km_arrayin as data, km_result_array
ORDER BY data.pid;
</pre>
This produces the result in column format:
<pre class="result">
pid | p1 | p2 | p3 | p4 | p5 | p6 | p7 | p8 | p9 | p10 | p11 | p12 | p13 | cluster_id
-----+-------+------+------+------+-----+------+------+------+------+--------+------+------+------+------------
1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.8 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 0
2 | 13.2 | 1.78 | 2.14 | 11.2 | 1 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.49 | 1050 | 0
3 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.8 | 3.24 | 0.3 | 2.81 | 5.6799 | 1.03 | 3.17 | 1185 | 1
4 | 14.37 | 1.95 | 2.5 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.8 | 0.86 | 3.45 | 1480 | 1
5 | 13.24 | 2.59 | 2.87 | 21 | 118 | 2.8 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 0
6 | 14.2 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 | 6.75 | 1.05 | 2.85 | 1450 | 1
7 | 14.39 | 1.87 | 2.45 | 14.6 | 96 | 2.5 | 2.52 | 0.3 | 1.98 | 5.25 | 1.02 | 3.58 | 1290 | 1
8 | 14.06 | 2.15 | 2.61 | 17.6 | 121 | 2.6 | 2.51 | 0.31 | 1.25 | 5.05 | 1.06 | 3.58 | 1295 | 1
9 | 14.83 | 1.64 | 2.17 | 14 | 97 | 2.8 | 2.98 | 0.29 | 1.98 | 5.2 | 1.08 | 2.85 | 1045 | 0
10 | 13.86 | 1.35 | 2.27 | 16 | 98 | 2.98 | 3.15 | 0.22 | 1.85 | 7.2199 | 1.01 | 3.55 | 1045 | 0
(10 rows)
</pre>
<h4>Auto Clustering for Range of <em>k</em> Values</h4>
-# Auto k selection. Now let's run k-means random for a variety of k values
and compare using simple silhouette and elbow methods.
<pre class="example">
DROP TABLE IF EXISTS km_result_auto, km_result_auto_summary;
SELECT madlib.kmeans_random_auto(
'km_sample', -- points table
'km_result_auto', -- output table
'points', -- column name in point table
ARRAY[2,3,4,5,6], -- k values to try
'madlib.squared_dist_norm2', -- distance function
'madlib.avg', -- aggregate function
20, -- max iterations
0.001, -- minimum fraction of centroids reassigned to continue iterating
'both' -- k selection algorithm (simple silhouette and elbow)
);
\\x on
SELECT * FROM km_result_auto_summary;
</pre>
<pre class="result">-[ RECORD 1 ]-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 6
centroids | {{14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.345,1.495,2.22,15,97.5,2.89,3.065,0.255,1.915,6.20995,1.045,3.2,1045},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}}
cluster_variance | {0,0,452.7633,8078.22646267333,5.346498005,0}
objective_fn | 8536.33626067833
frac_reassigned | 0
num_iterations | 3
silhouette | 0.954496117675027
elbow | -50296.3677976633
selection_algorithm | silhouette
</pre>
The best selection above is made by the silhouette algorithm by default.
Note that the elbow method may select a different k value as the best.
To see results for all k values:
<pre class="example">
SELECT * FROM km_result_auto ORDER BY k;
</pre>
<pre class="result">
-[ RECORD 1 ]----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 2
centroids | {{14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340},{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}}
cluster_variance | {60672.638245208,90512.324426408}
objective_fn | 151184.962671616
frac_reassigned | 0
num_iterations | 3
silhouette | 0.872087020146542
elbow | -1056.25028126836
-[ RECORD 2 ]----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 3
centroids | {{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}}
cluster_variance | {8078.22646267333,452.7633,90512.324426408}
objective_fn | 99043.3141890814
frac_reassigned | 0
num_iterations | 2
silhouette | 0.895849874221733
elbow | 20549.7753189989
-[ RECORD 3 ]----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 4
centroids | {{14.02,1.765,2.385,16.05,105.75,2.845,3.1075,0.2725,2.2325,5.93495,1.04,3.3725,1085},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}}
cluster_variance | {14227.41709401,0,30561.74805,0}
objective_fn | 44789.16514401
frac_reassigned | 0
num_iterations | 3
silhouette | 0.877839150000438
elbow | 17535.7421610686
-[ RECORD 4 ]----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 5
centroids | {{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{14.225,2.01,2.53,16.1,108.5,2.55,2.515,0.305,1.615,5.15,1.04,3.58,1292.5},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050},{14.04,1.8225,2.435,16.65,110,2.845,2.97,0.295,1.985,5.594975,1.0425,3.3125,972.5},{13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185}}
cluster_variance | {452.7633,329.8988,0,76176.4564000075,0}
objective_fn | 76959.1185000075
frac_reassigned | 0
num_iterations | 2
silhouette | 0.771207558995578
elbow | -28690.3421973961
-[ RECORD 5 ]----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
k | 6
centroids | {{14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.345,1.495,2.22,15,97.5,2.89,3.065,0.255,1.915,6.20995,1.045,3.2,1045},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}}
cluster_variance | {0,0,452.7633,8078.22646267333,5.346498005,0}
objective_fn | 8536.33626067833
frac_reassigned | 0
num_iterations | 3
silhouette | 0.954496117675027
elbow | -50296.3677976633
</pre>
-# Simplified silhouette per point. Let's say we want the simplified silhouette
coefficient for each point in the data set, for the case where <em>k=3</em>:
<pre class="example">
DROP TABLE IF EXISTS km_points_silh;
SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input points table
'km_points_silh', -- Output table
'pid', -- Point ID column in input table
'points', -- Points column in input table
(SELECT centroids FROM km_result_auto WHERE k=3), -- centroids array
'madlib.squared_dist_norm2' -- Distance function
);
SELECT * FROM km_points_silh ORDER BY pid;
</pre>
<pre class="result">
pid | centroid_id | neighbor_centroid_id | silh
-----+-------------+----------------------+-------------------
1 | 2 | 0 | 0.800019825058391
2 | 2 | 0 | 0.786712987562913
3 | 0 | 2 | 0.867496080386644
4 | 1 | 0 | 0.995466498952947
5 | 2 | 0 | 0.761551610381542
6 | 1 | 0 | 0.993950286967157
7 | 0 | 1 | 0.960621637528528
8 | 0 | 1 | 0.941493577405087
9 | 2 | 0 | 0.925822020308802
10 | 2 | 0 | 0.92536421766532
(10 rows)
</pre>
@anchor background
@par Technical Background
<b>k-means Algorithm</b>
<br>
Formally, we wish to minimize the following objective function:
\f[
(c_1, \dots, c_k) \mapsto \sum_{i=1}^n \min_{j=1}^k \operatorname{dist}(x_i, c_j)
\f]
In the most common case, \f$ \operatorname{dist} \f$ is the square of the
Euclidean distance, though other distance measures can be used.
This problem is computationally difficult (NP-hard), yet the
local-search heuristic proposed by Lloyd <a href="#kmeans-lit-4">[4]</a> performs reasonably well in
practice. In fact, it is so ubiquitous today that it is
often referred to as the <em>standard algorithm</em> or even just the
<em>k-means algorithm</em>. It works as follows:
-# Seed the \f$ k \f$ centroids, meaning specify their initial positions (see below).
-# Repeat until convergence:
-# Assign each point to its closest centroid.
-# Move each centroid to a position that minimizes the sum of distances in this
cluster.
-# Convergence is achieved when no points change their assignments during step
2a.
Since the objective function decreases in every step, this algorithm is
guaranteed to converge to a local optimum.
The quality of k-means is highly influenced by the choice of the seeding.
In MADlib we support uniform-at-random sampling, kmeans++ as well as the
ability for the user to enter manual seeding based on other methods.
k-means++ seeding <a href="#kmeans-lit-1">[1]</a> starts with a single centroid chosen randomly among the input points. It then
iteratively chooses new centroids from the input points until there are a total of k centroids. The
probability for picking a particular point is proportional to its minimum distance to any existing
centroid. Intuitively, k-means++ favors seedings where centroids are spread out over the whole
range of the input points, while at the same time not being too susceptible to outliers.
@anchor simplified_silhouette
<b>Silhouette</b>
<br>
To evaluate the validity of clustering with different k values,
the objective function is not ideal because it decreases as
k value increases, so it has a bias
toward selecting the largest k as the best result <a href="#kmeans-lit-6">[6]</a>.
Therefore we use other internal measures to evaluate cluster validity.
One such measure is silhouette score. The original formulation <a href="#kmeans-lit-7">[7]</a>
computes the average distance of a data
point to all the other data points in its own cluster, and to all the data points
in the neighbouring cluster nearest to the data point.
This is expensive for a large number of points
since it requires the full pairwise distance matrix over all data.
In the simplified silhouette <a href="#kmeans-lit-3">[3]</a> which is used in MADlib,
the distance of a data point to a cluster
is represented with the distance to the cluster centroid instead of the
average distance to all (other) data points in the cluster.
This has the advantage of being much faster to compute, and can be shown
to be comparable in performance to the full silhouette <a href="#kmeans-lit-8">[8]</a>.
@anchor elbow
<b>Elbow Method</b>
<br>
The elbow method considers the percentage of variance explained as a function of number of clusters.
The idea is not to add another cluster if it doesn't model the data better.
Graphically it means identifying the "elbow" in the curve of sum of squared errors
vs. number of clusters (k). This was possibly originally suggested in <a href="#kmeans-lit-9">[9]</a>.
To locate the elbow, we use the 2nd derivative of the
objective function using the NumPy gradient function <a href="#kmeans-lit-2">[2]</a>.
@anchor literature
@literature
@anchor kmeans-lit-1
[1] David Arthur, Sergei Vassilvitskii: k-means++: the advantages of careful
seeding, Proceedings of the 18th Annual ACM-SIAM Symposium on Discrete
Algorithms (SODA'07), pp. 1027-1035.
@anchor kmeans-lit-2
[2] NumPy gradient function, https://docs.scipy.org/doc/numpy/reference/generated/numpy.gradient.html
@anchor kmeans-lit-3
[3] E. R. Hruschka, L. N. C. Silva, R. J. G. B. Campello: Clustering
Gene-Expression Data: A Hybrid Approach that Iterates Between k-Means and
Evolutionary Search. In: Studies in Computational Intelligence - Hybrid
Evolutionary Algorithms. pp. 313-335. Springer. 2007.
@anchor kmeans-lit-4
[4] Lloyd, Stuart: Least squares quantization in PCM. Technical Note, Bell
Laboratories. Published much later in: IEEE Transactions on Information
Theory 28(2), pp. 128-137. 1982.
@anchor kmeans-lit-5
[5] Leisch, Friedrich: A Toolbox for K-Centroids Cluster Analysis. In: Computational
Statistics and Data Analysis, 51(2). pp. 526-544. 2006.
@anchor kmeans-lit-6
[6] Jain, A.K.: Data clustering: 50 years beyond k-means. Pattern recognition
letters 31(8), 651–666 (2010)
@anchor kmeans-lit-7
[7] Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the Interpretation
and Validation of Cluster Analysis”. Computational and Applied Mathematics 20: 53-65
@anchor kmeans-lit-8
[8] Wang F., Franco-Penya HH., Kelleher J.D., Pugh J., Ross R. (2017) An Analysis
of the Application of Simplified Silhouette to the Evaluation of k-means Clustering
Validity. In: Perner P. (eds) Machine Learning and Data Mining in Pattern Recognition.
MLDM 2017. Lecture Notes in Computer Science, vol 10358. Springer, Cham
@anchor kmeans-lit-9
[9] Robert L. Thorndike (December 1953). "Who Belongs in the Family?". Psychometrika. 18 (4): 267–276. doi:10.1007/BF02289263.
@anchor related
@par Related Topics
File kmeans.sql_in documenting the k-Means SQL functions
\ref grp_svec
closest_column()
array_unnest_2d_to_1d()
@internal
@sa namespace kmeans (documenting the implementation in Python)
@endinternal
*/
/*
* @brief k-Means return type
*
* A composite value:
* - <tt>centroids</tt> - Matrix containing the new \f$ l \leq k \f$
* repositioned centroids as columns. If this matrix has \f$ l < k \f$
* columns, one or more old centroids no longer were closest to any point.
* - <tt>old_centroid_ids</tt> - The order of the centroids in
* <tt>centroid</tt> is not guaranteed to be consitent across iterations.
* In particular, if a centroid is no longer closest to any point it can be
* dropped and a new centroid is added afterwards. We therefore need to map
* positions in <tt>centroids</tt> to the respective positions in the
* previous iteration.
* - <tt>objective_fn</tt> - Value of the objective function, i.e.,
* \f$ \sum_{x \in P} \dist(x, C)^2 \f$ where
* \f$ P \f$ is the set of points, \f$ C \f$ is the set of centroids, and
* \f$ \dist(x, C) := \min_{c \in C} \operatorname{dist}(x, c) \f$.
* - <tt>frac_reassigned</tt> - Fraction of points that was assigned a
* different centroid in the current iteration.
* - <tt>num_iterations</tt> - Number of iterations performed (so far).
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.kmeans_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.kmeans_result AS (
centroids DOUBLE PRECISION[][],
cluster_variance DOUBLE PRECISION[],
objective_fn DOUBLE PRECISION,
frac_reassigned DOUBLE PRECISION,
num_iterations INTEGER
);
/*
* @brief k-Means inter-iteration state type
*
* A composite value like \ref{kmeans_result}. Additional fields:
* - <tt>old_centroid_its</tt> - The order of the centroids in
* <tt>centroid</tt> is not guaranteed to be consitent across iterations.
* In particular, if a centroid is no longer closest to any point it can be
* dropped and a new centroid is added afterwards. We therefore need to map
* positions in <tt>centroids</tt> to the respective positions in the
* previous iteration.
* - <tt>num_iterations</tt> - Number of iterations performed (so far).
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.kmeans_state CASCADE;
CREATE TYPE MADLIB_SCHEMA.kmeans_state AS (
centroids DOUBLE PRECISION[][],
old_centroid_ids INTEGER[],
cluster_variance DOUBLE PRECISION[],
objective_fn DOUBLE PRECISION,
frac_reassigned DOUBLE PRECISION
);
/**
* @internal
* @brief Execute a SQL command where $1, ..., $4 are substituted with the
* given arguments.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
sql VARCHAR, DOUBLE PRECISION[][], REGPROC, INTEGER, DOUBLE PRECISION, VARCHAR
) RETURNS VOID
VOLATILE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans(
rel_args VARCHAR,
rel_state VARCHAR,
rel_source VARCHAR,
expr_point VARCHAR,
agg_centroid VARCHAR)
RETURNS INTEGER
VOLATILE
LANGUAGE plpythonu
AS $$PythonFunction(kmeans, kmeans, compute_kmeans)$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- Validate the kmeans arguments
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_src(
rel_source VARCHAR
) RETURNS VOID AS $$
PythonFunction(kmeans, kmeans, kmeans_validate_src)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_expr(
rel_source VARCHAR,
expr_point VARCHAR
) RETURNS BOOLEAN AS $$
PythonFunction(kmeans, kmeans, kmeans_validate_expr)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
initial_centroids DOUBLE PRECISION[][], /*+ DEFAULT NULL */
seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */
) RETURNS VOID AS $$
DECLARE
num_centroids INTEGER;
num_points INTEGER;
array_dim INTEGER[];
rel_source_regclass REGCLASS;
rel_filtered VARCHAR;
BEGIN
-- Validate the expr_point input. Since we don't need a view at this
-- point, the output is safe to ignore.
PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
rel_source_regclass := rel_source;
IF (initial_centroids IS NOT NULL) THEN
num_centroids := array_upper(initial_centroids,1);
ELSE
num_centroids := k;
END IF;
IF (k <= 0) THEN
RAISE EXCEPTION 'Number of clusters k must be a positive integer.';
END IF;
IF (k > 32767) THEN
RAISE EXCEPTION 'Kmeans Error: Number of clusters exceeded maximum limit
Number of clusters k must be <= 32767 (for results to be returned in a
reasonable amount of time).';
END IF;
EXECUTE $sql$ SELECT count(*)
FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$
WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$
INTO num_points ;
IF (num_points < k OR num_points < num_centroids) THEN
RAISE EXCEPTION 'Number of centroids is greater than number of points.';
END IF;
IF (k < num_centroids) THEN
RAISE WARNING 'Kmeans Warning: Number of initial centroids more than requested clusters.
Number of clusters k is less than number of supplied initial centroids.
Number of final clusters will equal number of supplied initial centroids.';
END IF;
IF (seeding_sample_ratio <= 0.) THEN
RAISE EXCEPTION 'Kmeans Error: Sampling ratio should be greater than 0 to perform seeding.';
END IF;
EXECUTE $sql$
SELECT array_agg(length) FROM (
SELECT (
array_upper( $sql$ || expr_point || $sql$::float8[], 1)
- array_lower( $sql$ || expr_point || $sql$::float8[], 1) + 1
) AS length,
array_lower( $sql$ || expr_point || $sql$::float8[], 1) AS lower
FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$
WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8
GROUP BY length, lower) t $sql$
INTO array_dim;
IF (array_upper(array_dim, 1) > 1) THEN
RAISE EXCEPTION 'Kmeans error: Feature column has arrays of different dimensions.
The input table contains arrays (evaluated from ''expr_point'') of different dimensions.';
END IF;
IF (initial_centroids is not NULL) THEN
IF (array_upper(initial_centroids, 2) != array_dim[1]) THEN
RAISE EXCEPTION 'Kmeans error: mismatched dimensionality between initial centroids and feature array
The input table contains arrays (evaluated from expr_point) of
dimension (%) which is not same as dimension of
provided initial centroid (%).',
array_dim[1], array_upper(initial_centroids, 2);
END IF;
END IF;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */
) RETURNS VOID AS $$
DECLARE
BEGIN
PERFORM MADLIB_SCHEMA.__seeding_validate_args($1, $2, $3, $4,
1.0::DOUBLE PRECISION);
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/**
* @brief Perform Lloyd's k-means local-search heuristic
*
* @param rel_source Name of the relation containing input points
* @param expr_point Expression evaluating to point coordinates for each tuple
* @param initial_centroids Matrix containing the initial centroids as columns
* @param fn_dist Name of a function with signature
* <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that
* returns the distance between two points. The default is the
* \ref squared_dist_norm2(float8[],float8[]) "squared Euclidean distance".
* @param agg_centroid Name of an aggregate function with signature
* <tt>DOUBLE PRECISION[] -> DOUBLE PRECISION[]</tt> that, for each group
* of points, returns a centroid. In order for Lloyd's local-search
* heuristic to provably converge and to return a local minimum, this
* centroid should minimize the sum of distances between each point in the
* group and the centroid. The default is the
* \ref avg(float8[]) "average (mean/barycenter in Euclidean space)",
* which satisfies this property if <tt>fn_dist = 'squared_dist_norm2'</tt>.
* @param max_num_iterations Maximum number of iterations
* @param min_frac_reassigned Fraction of reassigned points below which
* convergence is assumed and the algorithm terminates
* @returns A composite value:
* - <tt>centroids</tt> - Matrix with \f$ k \f$ centroids as columns.
* - <tt>frac_reassigned</tt> - Fraction of points that were assigned a
* different centroid in the last iteration.
* - <tt>num_iterations</tt> - The number of iterations before the
* algorithm terminated
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
initial_centroids DOUBLE PRECISION[][],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS MADLIB_SCHEMA.kmeans_result AS $$
DECLARE
theIteration INTEGER;
theResult MADLIB_SCHEMA.kmeans_result;
oldClientMinMessages VARCHAR;
class_rel_source REGCLASS;
proc_fn_dist REGPROCEDURE;
proc_agg_centroid REGPROCEDURE;
rel_filtered VARCHAR;
num_points INTEGER;
k INTEGER;
centroids FLOAT8[];
data_size INTEGER;
init_size INTEGER;
BEGIN
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
IF (array_upper(initial_centroids,1) IS NULL) THEN
RAISE EXCEPTION 'Kmeans error: No valid initial centroids given.';
END IF;
centroids := ARRAY(SELECT unnest(initial_centroids));
IF (SELECT MADLIB_SCHEMA.svec_elsum(centroids)) >= 'Infinity'::float THEN
RAISE EXCEPTION 'Kmeans error: At least one initial centroid has non-finite values.';
END IF;
-- init_size := (SELECT array_upper(initial_centroids[1], 1));
-- EXECUTE 'SELECT ARRAY_UPPER(' || expr_point || ', 1) FROM ' || rel_source || ' limit 1;' INTO data_size;
-- IF (init_size != data_size) THEN
-- RAISE EXCEPTION 'The initial centroids should have the same dimension as the data point!';
-- END IF;
class_rel_source := rel_source;
proc_fn_dist := fn_dist
|| '(DOUBLE PRECISION[], DOUBLE PRECISION[])';
-- Handle PG11 pg_proc table changes
IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
proc_agg_centroid := agg_centroid || '(DOUBLE PRECISION[])';
IF (SELECT prorettype != 'DOUBLE PRECISION[]'::regtype OR proisagg = FALSE
FROM pg_proc WHERE oid = proc_agg_centroid) THEN
RAISE EXCEPTION 'Kmeans error: Mean aggregate has wrong signature or is not an aggregate.';
END IF;
ELSE
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a'
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
proc_agg_centroid := agg_centroid || '(DOUBLE PRECISION[])';
IF (SELECT prorettype != 'DOUBLE PRECISION[]'::regtype OR prokind != 'a'
FROM pg_proc WHERE oid = proc_agg_centroid) THEN
RAISE EXCEPTION 'Kmeans error: Mean aggregate has wrong signature or is not an aggregate.';
END IF;
END IF;
IF (min_frac_reassigned < 0) OR (min_frac_reassigned > 1) THEN
RAISE EXCEPTION 'Kmeans error: Invalid convergence threshold (must be a fraction between 0 and 1).';
END IF;
IF (max_num_iterations < 0) THEN
RAISE EXCEPTION 'Kmeans error: Number of iterations must be a non-negative integer.';
END IF;
-- Extra parameter check added so that ERROR output is more user-readable (doesn't include Python traceback)
k := array_upper(initial_centroids,1);
IF (k <= 0) THEN
RAISE EXCEPTION 'Kmeans error: Number of clusters k must be a positive integer.';
END IF;
IF (k > 32767) THEN
RAISE EXCEPTION 'Kmeans error: Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).';
END IF;
EXECUTE $sql$ SELECT count(*)
FROM $sql$ || textin(regclassout(class_rel_source)) || $sql$
WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$
INTO num_points ;
IF (num_points < k) THEN
RAISE EXCEPTION 'Kmeans error: Number of centroids is greater than number of points.';
END IF;
-- We first setup the argument table. Rationale: We want to avoid all data
-- conversion between native types and Python code. Instead, we use Python
-- as a pure driver layer.
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-- Unfortunately, the EXECUTE USING syntax is only available starting
-- PostgreSQL 8.4:
-- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
-- We therefore have to emulate.
PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
DROP TABLE IF EXISTS pg_temp._madlib_kmeans_args;
CREATE TABLE pg_temp._madlib_kmeans_args AS
SELECT
$1 AS initial_centroids,
array_upper($1, 1) AS k,
$2 AS fn_dist,
$3 AS max_num_iterations,
$4 AS min_frac_reassigned,
$5 as fn_dist_name;
$sql$,
initial_centroids, proc_fn_dist, max_num_iterations,
min_frac_reassigned, fn_dist);
-- Perform acutal computation.
-- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
-- operators from regclass to varchar/text.
theIteration := MADLIB_SCHEMA.internal_compute_kmeans(
'_madlib_kmeans_args',
'_madlib_kmeans_state',
textin(regclassout(class_rel_source)), expr_point,
textin(regprocout(proc_agg_centroid)));
-- Retrieve result from state table and return it
EXECUTE
$sql$
SELECT (_state).centroids, (_state).cluster_variance, (_state).objective_fn,
(_state).frac_reassigned, NULL
FROM _madlib_kmeans_state
WHERE _iteration = $sql$ || theIteration || $sql$
$sql$
INTO theResult;
-- The number of iterations are not updated in the C++ code. We do it here.
IF NOT (theResult IS NULL) THEN
theResult.num_iterations = theIteration;
END IF;
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
initial_centroids DOUBLE PRECISION[][],
fn_dist VARCHAR,
agg_centroid VARCHAR,
max_num_iterations INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, $6, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
initial_centroids DOUBLE PRECISION[][],
fn_dist VARCHAR,
agg_centroid VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, 20, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
initial_centroids DOUBLE PRECISION[][],
fn_dist VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, 'MADLIB_SCHEMA.avg', 20,
0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
initial_centroids DOUBLE PRECISION[][]
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans($1, $2, $3,
'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args(
sql VARCHAR, INTEGER, REGPROC, DOUBLE PRECISION[][], VARCHAR
) RETURNS VOID
VOLATILE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
rel_args VARCHAR,
rel_state VARCHAR,
rel_source VARCHAR,
expr_point VARCHAR)
RETURNS INTEGER
AS $$PythonFunction(kmeans, kmeans, compute_kmeanspp_seeding)$$
LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief k-Means++ Seeding
*
* @param rel_source Name of the relation containing input points
* @param expr_point Expression evaluating to point coordinates for each tuple
* @param k Number of centroids
* @param fn_dist Name of a function with signature
* <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that
* returns the distance between two points
* @param initial_centroids A matrix containing up to \f$ k \f$ columns as
* columns. kmeanspp_seeding() proceeds exactly as if these centroids had
* already been generated in previous iterations. This parameter may be
* NULL in which all \f$ k \f$ centroids will be generated.
* @returns A matrix containing \f$ k \f$ centroids as columns
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */,
seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */
) RETURNS DOUBLE PRECISION[][] AS $$
DECLARE
theIteration INTEGER;
theResult DOUBLE PRECISION[][];
oldClientMinMessages VARCHAR;
sampled_rel_source VARCHAR;
sampled_col_name VARCHAR;
class_rel_source REGCLASS;
rel_filtered VARCHAR;
proc_fn_dist REGPROCEDURE;
num_points INTEGER;
num_centroids INTEGER;
num_array_dim INTEGER;
BEGIN
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
PERFORM MADLIB_SCHEMA.__seeding_validate_args(
rel_source, expr_point, k, initial_centroids, seeding_sample_ratio);
proc_fn_dist := fn_dist || '(DOUBLE PRECISION[], DOUBLE PRECISION[])';
IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
ELSE
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a'
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
END IF;
-- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
-- operators from regclass to varchar/text.
class_rel_source := rel_source;
sampled_rel_source = MADLIB_SCHEMA.__unique_string();
sampled_col_name = MADLIB_SCHEMA.__unique_string();
IF (seeding_sample_ratio < 1.0) THEN
EXECUTE 'DROP TABLE IF EXISTS '||sampled_rel_source||' CASCADE';
EXECUTE 'CREATE TEMP TABLE '||sampled_rel_source||' AS
SELECT *
FROM
(
SELECT
*,
random() AS '||sampled_col_name||'
FROM '||textin(regclassout(class_rel_source))||'
WHERE abs(
coalesce(
MADLIB_SCHEMA.svec_elsum('||expr_point||'),
''Infinity''::FLOAT8
)
) < ''Infinity''::FLOAT8
) subq
WHERE '||sampled_col_name||' < '||seeding_sample_ratio;
class_rel_source := sampled_rel_source;
END IF;
-- We first setup the argument table. Rationale: We want to avoid all data
-- conversion between native types and Python code. Instead, we use Python
-- as a pure driver layer.
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-- Unfortunately, the EXECUTE USING syntax is only available starting
-- PostgreSQL 8.4:
-- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
-- We therefore have to emulate.
PERFORM MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args($sql$
DROP TABLE IF EXISTS pg_temp._madlib_kmeanspp_args;
CREATE TEMPORARY TABLE _madlib_kmeanspp_args AS
SELECT
$1 AS k,
$2 AS fn_dist,
$3 AS initial_centroids,
$4 AS fn_dist_name
$sql$,
k, proc_fn_dist, initial_centroids, fn_dist);
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
-- Perform acutal computation.
theIteration := (
SELECT MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
'_madlib_kmeanspp_args', '_madlib_kmeanspp_state',
textin(regclassout(class_rel_source)), expr_point)
);
-- Retrieve result from state table and return it
EXECUTE
$sql$
SELECT _state FROM _madlib_kmeanspp_state
WHERE _iteration = $sql$ || theIteration || $sql$
$sql$
INTO theResult;
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
IF (seeding_sample_ratio < 1.0) THEN
EXECUTE 'DROP TABLE IF EXISTS '||sampled_rel_source||' CASCADE';
END IF;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR,
initial_centroids DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[][]
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, $5, 1.0::DOUBLE PRECISION)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR
) RETURNS DOUBLE PRECISION[][]
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, NULL, 1.0::DOUBLE PRECISION)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER
) RETURNS DOUBLE PRECISION[][]
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3,
'MADLIB_SCHEMA.squared_dist_norm2', NULL, 1.0::DOUBLE PRECISION)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Run k-Means++.
*
* This is a shortcut for running k-means++. It is equivalent to
* <pre>SELECT \ref kmeans(
rel_source,
expr_point,
\ref kmeanspp_seeding(
rel_source,
expr_point,
k,
fn_dist
),
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)</pre>
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */,
seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, NULL, $8),
$4, $5, $6, $7);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, $6, $7, 1.0::DOUBLE PRECISION);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR,
agg_centroid VARCHAR,
max_num_iterations INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, $6,
0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR,
agg_centroid VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, 20::INTEGER,
0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4,
'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER,
0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3,
'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR,
'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER,
0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args(
sql VARCHAR, INTEGER, DOUBLE PRECISION[][]
) RETURNS VOID
VOLATILE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
rel_args VARCHAR,
rel_state VARCHAR,
rel_source VARCHAR,
expr_point VARCHAR)
RETURNS INTEGER
AS $$PythonFunction(kmeans, kmeans, compute_kmeans_random_seeding)$$
LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief k-Means Random Seeding
*
* @param rel_source Name of the relation containing input points
* @param expr_point Expression evaluating to point coordinates for each tuple
* @param k Number of centroids
* @param initial_centroids A matrix containing up to \f$ k \f$ columns as
* columns. kmeanspp_seeding() proceeds exactly as if these centroids had
* already been generated in previous iterations. This parameter may be
* NULL in which all \f$ k \f$ centroids will be generated.
* @returns A matrix containing \f$ k \f$ centroids as columns
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */
) RETURNS DOUBLE PRECISION[][] AS $$
DECLARE
theIteration INTEGER;
theResult DOUBLE PRECISION[][];
oldClientMinMessages VARCHAR;
class_rel_source REGCLASS;
num_points INTEGER;
num_centroids INTEGER;
rel_filtered VARCHAR;
BEGIN
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
PERFORM MADLIB_SCHEMA.__seeding_validate_args(
rel_source, expr_point, k, initial_centroids);
-- We first setup the argument table. Rationale: We want to avoid all data
-- conversion between native types and Python code. Instead, we use Python
-- as a pure driver layer.
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-- Unfortunately, the EXECUTE USING syntax is only available starting
-- PostgreSQL 8.4:
-- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
-- We therefore have to emulate.
PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args($sql$
DROP TABLE IF EXISTS pg_temp._madlib_kmeans_random_args;
CREATE TEMPORARY TABLE _madlib_kmeans_random_args AS
SELECT $1 AS k, $2 AS initial_centroids;
$sql$,
k, initial_centroids);
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
-- Perform acutal computation.
-- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
-- operators from regclass to varchar/text.
class_rel_source := rel_source;
theIteration := (
SELECT MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
'_madlib_kmeans_random_args', '_madlib_kmeans_random_state',
textin(regclassout(class_rel_source)), expr_point)
);
-- Retrieve result from state table and return it
EXECUTE
$sql$
SELECT _state FROM _madlib_kmeans_random_state
WHERE _iteration = $sql$ || theIteration || $sql$
$sql$
INTO theResult;
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
RETURN theResult;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_seeding(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER
) RETURNS DOUBLE PRECISION[][]
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3, NULL)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Run k-Means with random seeding.
*
* This is a shortcut for running k-means with random seeding. It is equivalent
* to
* <pre>SELECT \ref kmeans(
rel_source,
expr_point,
\ref kmeans_random_seeding(
rel_source,
expr_point,
k
),
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)</pre>
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
$4, $5, $6, $7);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR,
agg_centroid VARCHAR,
max_num_iterations INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
$4, $5, $6, 0.001);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR,
agg_centroid VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
$4, $5, 20, 0.001);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER,
fn_dist VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2,
MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
$4, 'MADLIB_SCHEMA.avg', 20, 0.001);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random(
rel_source VARCHAR,
expr_point VARCHAR,
k INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
ret MADLIB_SCHEMA.kmeans_result;
BEGIN
ret = MADLIB_SCHEMA.kmeans(
$1, $2,
MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001);
RETURN ret;
END
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @internal
* @brief Execute a SQL command where $1, ..., $6 are substituted with the
* given arguments.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
sql VARCHAR, rel_source VARCHAR, expr_point VARCHAR,
fn_dist VARCHAR, agg_centroid VARCHAR, max_num_iterations INTEGER,
min_frac_reassigned DOUBLE PRECISION
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Perform Lloyd's k-means local-search heuristic, but with initial
* centroids stored in a table
*
* This is a shortcut for running k-means with initial centroids stored in a
* table (as opposed to an array of centroids). It is equivalent
* to
* <pre>SELECT \ref kmeans(
rel_source,
expr_point,
(SELECT \ref matrix_agg($expr_centroid) FROM $rel_initial_centroids),
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)</pre>
* where <tt>$expr_centroid</tt> and <tt>$rel_initial_centroids</tt> denote
* textual substituions.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
rel_initial_centroids VARCHAR,
expr_centroid VARCHAR,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE plpgsql
AS $$
DECLARE
class_rel_initial_centroids REGCLASS;
theResult MADLIB_SCHEMA.kmeans_result;
BEGIN
class_rel_initial_centroids := rel_initial_centroids;
SELECT * FROM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
SELECT MADLIB_SCHEMA.kmeans(
$1, $2,
(
SELECT MADLIB_SCHEMA.matrix_agg(($sql$ || expr_centroid || $sql$)::FLOAT8[])
FROM $sql$ || textin(regclassout(class_rel_initial_centroids))
|| $sql$
),
$3, $4, $5, $6)
$sql$,
rel_source, expr_point,
fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned)
INTO theResult;
RETURN theResult;
END;
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
rel_initial_centroids VARCHAR,
expr_centroid VARCHAR,
fn_dist VARCHAR,
agg_centroid VARCHAR,
max_num_iterations INTEGER
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans(
$1, $2,
$3, $4, $5, $6, $7, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
rel_initial_centroids VARCHAR,
expr_centroid VARCHAR,
fn_dist VARCHAR,
agg_centroid VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans(
$1, $2,
$3, $4, $5, $6, 20, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
rel_initial_centroids VARCHAR,
expr_centroid VARCHAR,
fn_dist VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans(
$1, $2,
$3, $4, $5, 'MADLIB_SCHEMA.avg', 20, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans(
rel_source VARCHAR,
expr_point VARCHAR,
rel_initial_centroids VARCHAR,
expr_centroid VARCHAR
) RETURNS MADLIB_SCHEMA.kmeans_result
VOLATILE
LANGUAGE sql AS $$
SELECT MADLIB_SCHEMA.kmeans(
$1, $2,
$3, $4,
'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001)
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @internal
* @brief Execute a SQL command where $1, ..., $3 are substituted with the
* given arguments.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_silhouette_args(
sql VARCHAR,
centroids DOUBLE PRECISION[][],
fn_dist REGPROC,
fn_dist_name VARCHAR
) RETURNS DOUBLE PRECISION
STABLE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Compute a simplified version of the silhouette coefficient
*
* @param rel_source Name of the relation containing input points
* @param expr_point Expression evaluating to point coordinates \f$ x_i \f$ for
* each tuple
* @param centroids Matrix \f$ M = (\vec{m_0} \dots \vec{m_{k-1}})
* \in \mathbb{R}^{d \times k} \f$ with \f$ k \f$ columns, where column
* \f$ i \f$ contains the position of centroid \f$ i \f$.
* @param fn_dist Name of a function with signature
* <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that
* returns the distance between two points
* @return For each point \f$ x_i \f$, let
* \f$ d_1( x_i ) \f$ and \f$ d_2( x_i ) \f$ be the distance to the closest
* and 2nd-closest centroid, respectively. If there is more than one
* closest centroids then \f$ d_1( x_i ) = d_2( x_i )\f$.
* The return value is the average, over all points \f$ x_i \f$, of
* \f[
* \frac{d_2( x_i ) - d_1(x_i)}{d_2(x_i)},
* \f]
* where 0/0 is interpreted as 0.
* Clearly, the simplified silhouette coefficient assumes values in
* \f$ [0,1] \f$.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette(
rel_source VARCHAR,
expr_point VARCHAR,
centroids DOUBLE PRECISION[][],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */
) RETURNS DOUBLE PRECISION
LANGUAGE plpgsql VOLATILE
AS $$
DECLARE
class_rel_source REGCLASS;
proc_fn_dist REGPROCEDURE;
rel_filtered VARCHAR;
ans DOUBLE PRECISION;
BEGIN
IF (array_upper(centroids,1) IS NULL) THEN
RAISE EXCEPTION 'Kmeans error: No valid centroids given.';
END IF;
PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
-- Validate the expr_point input. Since we don't need a view at this
-- point, the output is safe to ignore.
PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
class_rel_source := rel_source;
proc_fn_dist := fn_dist
|| '(DOUBLE PRECISION[], DOUBLE PRECISION[])';
IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
ELSE
IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a'
FROM pg_proc WHERE oid = proc_fn_dist) THEN
RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.';
END IF;
END IF;
ans := MADLIB_SCHEMA.internal_execute_using_silhouette_args($sql$
SELECT
avg(CASE
WHEN distances[2] = 0 THEN 0
ELSE (distances[2] - distances[1]) / distances[2]
END)
FROM (
SELECT
(MADLIB_SCHEMA._closest_columns(
$1,
($sql$ || expr_point || $sql$)::FLOAT8[],
2::INTEGER,
$2,
$3
)).distances
FROM
$sql$ || textin(regclassout(class_rel_source)) || $sql$
WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8
) AS two_shortest_distances
$sql$,
centroids, proc_fn_dist, fn_dist);
RETURN ans;
END;
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette(
rel_source VARCHAR,
expr_point VARCHAR,
centroids DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION
STABLE
LANGUAGE sql
AS $$
SELECT MADLIB_SCHEMA.simple_silhouette($1, $2, $3,
'MADLIB_SCHEMA.squared_dist_norm2')
$$
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/**
* @brief Run auto k-Means.
*
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */,
seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */,
k_selection_algorithm VARCHAR /*+ DEFAULT 'silhouette' */
) RETURNS VOID AS $$
PythonFunction(`kmeans', `kmeans_auto', `kmeanspp_auto')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */,
seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, NULL, NULL, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[]
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, NULL, NULL, NULL, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */,
k_selection_algorithm VARCHAR /*+ DEFAULT 'silhouette' */
) RETURNS VOID AS $$
PythonFunction(`kmeans', `kmeans_auto', `kmeans_random_auto')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */,
min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, $7, $8, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
max_num_iterations INTEGER /*+ DEFAULT 20 */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, $7, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */,
agg_centroid VARCHAR /*+ DEFAULT 'avg' */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, NULL, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto(
rel_source VARCHAR,
output_table VARCHAR,
expr_point VARCHAR,
k INTEGER[]
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, NULL, NULL, NULL, NULL, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points(
rel_source VARCHAR,
output_table VARCHAR,
pid VARCHAR,
expr_point VARCHAR,
centroids_table VARCHAR,
centroids_col VARCHAR,
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */
) RETURNS VOID AS $$
PythonFunction(kmeans, kmeans, simple_silhouette_points_str_wrapper)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points(
rel_source VARCHAR,
output_table VARCHAR,
pid VARCHAR,
expr_point VARCHAR,
centroids_table VARCHAR,
centroids_col VARCHAR
) RETURNS VOID
AS $$
SELECT MADLIB_SCHEMA.simple_silhouette_points($1, $2, $3, $4, $5, $6,
'MADLIB_SCHEMA.squared_dist_norm2')
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points(
rel_source VARCHAR,
output_table VARCHAR,
pid VARCHAR,
expr_point VARCHAR,
centroids DOUBLE PRECISION[],
fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */
) RETURNS VOID AS $$
PythonFunction(kmeans, kmeans, simple_silhouette_points_dbl_wrapper)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points(
rel_source VARCHAR,
output_table VARCHAR,
pid VARCHAR,
expr_point VARCHAR,
centroids DOUBLE PRECISION[]
) RETURNS VOID
AS $$
SELECT MADLIB_SCHEMA.simple_silhouette_points($1, $2, $3, $4, $5,
'MADLIB_SCHEMA.squared_dist_norm2')
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');