| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file kmeans.sql_in |
| * |
| * @brief Set of functions for k-means clustering. |
| * |
| * @sa For a brief introduction to k-means clustering, see the module |
| * description \ref grp_kmeans. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_kmeans |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#cluster">Clustering Function</a></li> |
| <li class="level1"><a href="#auto_cluster">Auto Clustering Function</a></li> |
| <li class="level1"><a href="#assignment">Cluster Assignment</a></li> |
| <li class="level1"><a href="#silh">Simple Silhouette</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Partitions a set of observations into clusters by finding centroids that |
| minimize the sum of observations' distances from their closest centroid. |
| |
| Clustering refers to the problem of partitioning a set of objects according to |
| some problem-dependent measure of <em>similarity</em>. In the k-means variant, |
| given \f$ n \f$ points \f$ x_1, \dots, x_n \in \mathbb R^d \f$, the |
| goal is to position \f$ k \f$ centroids \f$ c_1, \dots, c_k \in \mathbb R^d \f$ |
| so that the sum of \em distances between each point and its closest centroid is |
| minimized. Each centroid represents a cluster that consists of all points to |
| which this centroid is closest. |
| |
| This module can compute clusters given the number of centroids <em>k</em> as an input, |
| using a variety of seeding methods. |
| It can also automatically select the best <em>k</em> value from a range |
| of suggested <em>k</em> values, using the simplified silhouette method |
| or the elbow method. |
| |
| @anchor cluster |
| @par Clustering Function |
| |
| The k-means algorithm can be invoked in different ways, depending on the |
| source of the initial set of centroids: |
| |
| - Uniform random centroid seeding method. |
| <pre class="syntax"> |
| kmeans_random( rel_source, |
| expr_point, |
| k, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| ) |
| </pre> |
| |
| - k-means++ centroid seeding method <a href="#kmeans-lit-1">[1]</a>. |
| This method can speed up convergence |
| by seeding centroids spread out over the whole range of the input |
| points, while at the same time not being too susceptible to outliers. |
| <pre class="syntax"> |
| kmeanspp( rel_source, |
| expr_point, |
| k, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned, |
| seeding_sample_ratio |
| ) |
| </pre> |
| |
| - Supply an initial centroid set in a relation identified |
| by the \e rel_initial_centroids argument, for the case |
| where initial centroids are stored in a table. |
| <pre class="syntax"> |
| kmeans( rel_source, |
| expr_point, |
| rel_initial_centroids, |
| expr_centroid, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| ) |
| </pre> |
| |
| - Provide an initial centroid set as an array expression |
| in the \e initial_centroids argument. |
| <pre class="syntax"> |
| kmeans( rel_source, |
| expr_point, |
| initial_centroids, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>rel_source</dt> |
| <dd>TEXT. The name of the table containing the input data points. Data points and |
| predefined centroids (if used) are expected to be stored row-wise, |
| in a column of type <tt>\ref grp_svec "SVEC"</tt> (or any type convertible to |
| <tt>\ref grp_svec "SVEC"</tt>, like <tt>FLOAT[]</tt> or <tt>INTEGER[]</tt>). |
| Data points with non-finite values (NULL, NaN, infinity) in any component |
| are skipped during analysis. |
| </dd> |
| |
| <dt>expr_point</dt> |
| <dd>TEXT. The name of the column with point coordinates or an array expression.</dd> |
| |
| <dt>k</dt> |
| <dd>INTEGER. The number of centroids to calculate.</dd> |
| |
| <dt>fn_dist (optional)</dt> |
| <dd>TEXT, default: 'squared_dist_norm2'. The name of the function |
| to use to calculate the distance from a data point vector to a centroid vector. |
| The following distance functions can be used (computation of barycenter/mean in parentheses): |
| <ul> |
| <li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median). |
| MADlib does not provide a median aggregate function for |
| performance reasons.</li> |
| <li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li> |
| <li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li> |
| <li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li> |
| <li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points <a href="#kmeans-lit-5">[5]</a>)</li> |
| <li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION</tt></li></ul></dd> |
| |
| <dt>agg_centroid (optional)</dt> |
| <dd>TEXT, default: 'avg'. The name of the aggregate function used to determine centroids. |
| The following aggregate functions can be used:<ul> |
| <li><b>\ref avg</b>: average (default)</li> |
| <li><b>\ref normalized_avg</b>: normalized average</li></ul></dd> |
| |
| <dt>max_num_iterations (optional)</dt> |
| <dd>INTEGER, default: 20. The maximum number of iterations to perform.</dd> |
| |
| <dt>min_frac_reassigned (optional)</dt> |
| <dd>DOUBLE PRECISION, default: 0.001. The minimum fraction of centroids reassigned to continue iterating. |
| When fewer than this fraction of centroids are reassigned in an iteration, the calculation completes. |
| |
| <dt>seeding_sample_ratio (optional)</dt> |
| <dd>DOUBLE PRECISION, default: 1.0. The proportion of subsample of original dataset |
| to use for kmeans++ centroid seeding method. Kmeans++ scans through the data |
| sequentially 'k' times and can be too slow for big datasets. When |
| 'seeding_sample_ratio' is greater than 0 (thresholded to be maximum value of 1.0), |
| the seeding is run on a uniform random subsample of the data. |
| Note: the final K-means algorithm is run on the complete dataset. This parameter |
| only builds a subsample for the seeding and is only available for kmeans++. |
| |
| <dt>rel_initial_centroids</dt> |
| <dd>TEXT. Table or view containing the set of initial centroids. |
| </dd> |
| |
| <dt>expr_centroid</dt> |
| <dd>TEXT. The name of the column (or the array expression) |
| in the <em>rel_initial_centroids</em> table or view that contains |
| the centroid coordinates.</dd> |
| |
| <dt>initial_centroids</dt> |
| <dd>DOUBLE PRECISION[][]. Array expression |
| with the initial centroid coordinates.</dd> |
| </dl> |
| |
| <b>Output</b> |
| <br> |
| The output of the k-means module is a composite type with |
| the following columns: |
| <table class="output"> |
| <tr> |
| <th>centroids</th> |
| <td>DOUBLE PRECISION[][]. The final centroid positions.</td> |
| </tr> |
| <tr> |
| <th>cluster_variance</th> |
| <td>DOUBLE PRECISION[]. The value of the objective function per cluster.</td> |
| </tr> |
| <tr> |
| <th>objective_fn</th> |
| <td>DOUBLE PRECISION. The value of the objective function.</td> |
| </tr> |
| <tr> |
| <th>frac_reassigned</th> |
| <td>DOUBLE PRECISION. The fraction of points reassigned in the last iteration.</td> |
| </tr> |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. The total number of iterations executed.</td> |
| </tr> |
| </table> |
| |
| @anchor auto_cluster |
| @par Auto Clustering Function |
| |
| The optimal number of centroids can be determined automatically, |
| from a set of candidate values that you provide, for |
| random seeding or <em>k-means++</em> seeding. The <a href="#simplified_silhouette">simplified |
| silhouette method</a> or the <a href="#elbow">elbow method</a> are used to pick the best <em>k</em> value. |
| |
| - Uniform random centroid seeding method. |
| <pre class="syntax"> |
| kmeans_random_auto( rel_source, |
| output_table, |
| expr_point, |
| k, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned, |
| k_selection_algorithm |
| ) |
| </pre> |
| |
| - k-means++ centroid seeding method. |
| <pre class="syntax"> |
| kmeanspp_auto( rel_source, |
| output_table, |
| expr_point, |
| k, |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned, |
| seeding_sample_ratio, |
| k_selection_algorithm |
| ) |
| </pre> |
| |
| <b>Arguments</b> |
| <br> |
| The arguments for auto k selection are the same as described above, |
| with the following additions: |
| <dl class="arglist"> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the output table containing results for each k |
| value. Details of the output table are shown below. |
| A summary table called <output_table>_summary will also be |
| created for the best k value as per the selection algorithm.</dd> |
| |
| <dt>k</dt> |
| <dd>INTEGER[]. Array of k values to test. Does not need to be |
| contiguous but all elements must be >1 and cannot repeat within the array. |
| Parameter 'k_selection_algorithm' |
| determines the evaluation method.</dd> |
| |
| <dt>k_selection_algorithm (optional)</dt> |
| <dd>TEXT, default: 'silhouette'. Method to evaluate optimal number of centroids k. |
| Current approaches supported: 'silhouette', 'elbow' or 'both'. |
| The text can be any subset of the strings; for e.g., 'silh' will |
| use the silhouette method. |
| Note that for large data sets, the silhouette computation |
| can be expensive.</dd> |
| |
| @note |
| Note that the auto k-means algorithms require the NumPy python package to be |
| installed on the system since the elbow method uses the NumPy gradient function <a href="#kmeans-lit-2">[2]</a>. |
| For Greenplum clusters, installing NumPy only on the |
| host machine of the master segment will be sufficient. The suggested |
| installation method is: <em>pip install numpy --user</em> |
| </dl> |
| |
| <b>Output Tables</b> |
| <br> |
| Two output tables are created for auto k-means. |
| The first is called 'output_table' and contains one row |
| per k value: |
| <table class="output"> |
| <tr> |
| <th>k</th> |
| <td>INTEGER. Number of centroids.</td> |
| </tr> |
| <tr> |
| <th>centroids</th> |
| <td>DOUBLE PRECISION[][]. The final centroid positions.</td> |
| </tr> |
| <tr> |
| <th>cluster_variance</th> |
| <td>DOUBLE PRECISION[]. The value of the objective function per cluster.</td> |
| </tr> |
| <tr> |
| <th>objective_fn</th> |
| <td>DOUBLE PRECISION. The value of the objective function.</td> |
| </tr> |
| <tr> |
| <th>frac_reassigned</th> |
| <td>DOUBLE PRECISION. The fraction of points reassigned in the last iteration.</td> |
| </tr> |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. The total number of iterations executed.</td> |
| </tr> |
| <tr> |
| <th>k</th> |
| <td>INTEGER. Number of centroids as per the |
| specified 'k_selection_algorithm'. If 'both' is specified, |
| the best k value will correspond to the silhouette method.</td> |
| </tr> |
| <tr> |
| <th>silhouette</th> |
| <td>DOUBLE PRECISION. The value of the simplified silhouette |
| score for the k value, if computed.</td> |
| </tr> |
| <tr> |
| <th>elbow</th> |
| <td>DOUBLE PRECISION. The value of the elbow |
| score for the k value, if computed.</td> |
| </tr> |
| <tr> |
| <th>selection_algorithm</th> |
| <td>TEXT. Algorithm used to select the best |
| k (either 'silhouette' or 'elbow')</td> |
| </tr> |
| </table> |
| </dl> |
| |
| A summary table named <output_table>_summary is also created, which has the same output as the |
| 'output_table' above but only contains one row for best k value as per the selection algorithm. |
| If 'both' is specified for 'k_selection_algorithm' |
| the best k value returned will correspond to the silhouette method. |
| |
| @anchor assignment |
| @par Cluster Assignment |
| |
| After clustering, the cluster assignment for each data point can be computed with |
| the help of the closest_column() utility function: |
| |
| <pre class="syntax"> |
| closest_column( m, |
| x, |
| dist |
| ) |
| </pre> |
| |
| <b>Arguments</b> |
| <dl class="arglist"> |
| <dt>m</dt> |
| <dd>DOUBLE PRECISION[][]. Learned centroids from the training function.</dd> |
| |
| <dt>x</dt> |
| <dd>DOUBLE PRECISION[]. Data points.</dd> |
| |
| <dt>dist (optional)</dt> |
| <dd>TEXT, default: 'squared_dist_norm2'. The name of the function |
| to use to calculate the distance from a data point vector to a centroid vector. |
| See the \e fn_dist argument of the k-means training function for more details on distance functions.</dd> |
| </dl> |
| |
| <b>Output</b> |
| <br> |
| The output is a composite type with |
| the following columns: |
| <table class="output"> |
| <tr> |
| <th>column_id</th> |
| <td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td> |
| </tr> |
| <tr> |
| <th>distance</th> |
| <td>DOUBLE PRECISION. Distance to the cluster centroid.</td> |
| </tr> |
| </table> |
| |
| @anchor silh |
| @par Simple Silhouette |
| |
| A common method to assess the quality of the clustering is the <em>silhouette |
| coefficient</em>, a simplified version of which is |
| implemented in MADlib <a href="#kmeans-lit-3">[3]</a>. |
| There are two silhouette functions: average score across all data points, |
| and a per-point score. The average silhouette function has the following syntax: |
| <pre class="syntax"> |
| simple_silhouette( rel_source, |
| expr_point, |
| centroids, |
| fn_dist |
| ) |
| </pre> |
| |
| The per-point silhouette function has two formats. The first |
| takes an array of centroids: |
| <pre class="syntax"> |
| simple_silhouette_points(rel_source, |
| output_table, |
| pid, |
| expr_point, |
| centroids, |
| fn_dist |
| ) |
| </pre> |
| |
| The second takes a centroids column from a table: |
| <pre class="syntax"> |
| simple_silhouette_points(rel_source, |
| output_table, |
| pid, |
| expr_point, |
| centroids_table, |
| centroids_col, |
| fn_dist |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>rel_source</dt> |
| <dd>TEXT. The name of the table containing the input data points.</dd> |
| |
| <dt>expr_point</dt> |
| <dd>TEXT. The name of the column with point coordinates or an array expression.</dd> |
| |
| <dt>centroids</dt> |
| <dd>DOUBLE PRECISION[][]. An expression evaluating to an array of centroids. </dd> |
| |
| <dt>fn_dist (optional)</dt> |
| <dd>TEXT, default: 'squared_dist_norm2'. The name of the function |
| to use to calculate the distance from a data point vector to a centroid vector. |
| See the \e fn_dist argument of the k-means training function for more details on distance functions.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the output table to contain sihouette score |
| for each point.</dd> |
| |
| <dt>pid</dt> |
| <dd>TEXT. Column containing point ID in the |
| table 'rel_source' containing the data points. </dd> |
| |
| <dt>centroids_table</dt> |
| <dd>TEXT. Name of the table containing the centroids.</dd> |
| |
| <dt>centroids_col</dt> |
| <dd>TEXT. Name of the column in the table 'centroids_table' |
| containing the centroids.</dd> |
| |
| </dl> |
| |
| <b>Output</b> |
| <br> |
| For the average function, a single value for simple silhouette score is returned. |
| For the per-point function, the output table contains |
| the following columns: |
| <table class="output"> |
| <tr> |
| <th>pid</th> |
| <td>INTEGER. Point ID.</td> |
| </tr> |
| |
| <tr> |
| <th>centroid_id</th> |
| <td>INTEGER. The cluster that the point is assigned to.</td> |
| </tr> |
| |
| <tr> |
| <th>neighbor_centroid_id</th> |
| <td>INTEGER. The next closest cluster to the one that the point is assigned to.</td> |
| </tr> |
| |
| <tr> |
| <th>simple_silhouette</th> |
| <td>DOUBLE PRECISION. Simplified silhouette score for the point.</td> |
| </tr> |
| </table> |
| |
| @anchor examples |
| @examp |
| |
| @note |
| Your results may not be exactly the same as below due to the nature random |
| nature of the k-means algorithm. Also, remember to be consistent in the distance |
| functions that you use in the k-means, silhouette and helper functions. |
| |
| <h4>Clustering for Single <em>k</em> Value</h4> |
| |
| -# Create input data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_sample; |
| CREATE TABLE km_sample(pid int, points double precision[]); |
| INSERT INTO km_sample VALUES |
| (1, '{14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}'), |
| (2, '{13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}'), |
| (3, '{13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}'), |
| (4, '{14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}'), |
| (5, '{13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}'), |
| (6, '{14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}'), |
| (7, '{14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}'), |
| (8, '{14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}'), |
| (9, '{14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}'), |
| (10, '{13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}'); |
| </pre> |
| -# Run k-means clustering using kmeans++ for centroid seeding. Use squared Euclidean |
| distance which is a commonly used distance function. |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_result; |
| CREATE TABLE km_result AS |
| SELECT * FROM madlib.kmeanspp( 'km_sample', -- Table of source data |
| 'points', -- Column containing point co-ordinates |
| 2, -- Number of centroids to calculate |
| 'madlib.squared_dist_norm2', -- Distance function |
| 'madlib.avg', -- Aggregate function |
| 20, -- Number of iterations |
| 0.001 -- Fraction of centroids reassigned to keep iterating |
| ); |
| \\x on; |
| SELECT * FROM km_result; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| centroids | {{14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75},{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333}} |
| cluster_variance | {30561.74805,122999.110416013} |
| objective_fn | 153560.858466013 |
| frac_reassigned | 0 |
| num_iterations | 2 |
| </pre> |
| -# Simplified silhouette coefficient. First find average |
| for whole data set. Make sure to use the same distance function as k-means above. |
| <pre class="example"> |
| SELECT * FROM madlib.simple_silhouette( 'km_sample', -- Input points table |
| 'points', -- Column containing points |
| (SELECT centroids FROM km_result), -- Centroids |
| 'madlib.squared_dist_norm2' -- Distance function |
| ); |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----+------------------ |
| simple_silhouette | 0.868174608939623 |
| </pre> |
| Now calculate simplified silhouette coefficient for each point in the data set: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_points_silh; |
| SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input points table |
| 'km_points_silh', -- Output table |
| 'pid', -- Point ID column in input table |
| 'points', -- Points column in input table |
| 'km_result', -- Centroids table |
| 'centroids', -- Column in centroids table containing centroids |
| 'madlib.squared_dist_norm2' -- Distance function |
| ); |
| SELECT * FROM km_points_silh ORDER BY pid; |
| </pre> |
| <pre class="result"> |
| pid | centroid_id | neighbor_centroid_id | silh |
| -----+-------------+----------------------+------------------- |
| 1 | 1 | 0 | 0.966608819821713 |
| 2 | 1 | 0 | 0.926251125077039 |
| 3 | 1 | 0 | 0.28073008848306 |
| 4 | 0 | 1 | 0.951447083910869 |
| 5 | 1 | 0 | 0.80098239014753 |
| 6 | 0 | 1 | 0.972487557020722 |
| 7 | 0 | 1 | 0.88838568723116 |
| 8 | 0 | 1 | 0.906334719972002 |
| 9 | 1 | 0 | 0.994315140692314 |
| 10 | 1 | 0 | 0.99420347703982 |
| (10 rows) |
| </pre> |
| |
| -# Find the cluster assignment for each point. |
| Use the closest_column() function to map each point to the cluster that it belongs to. |
| <pre class="example"> |
| \\x off; |
| DROP TABLE IF EXISTS point_cluster_map; |
| CREATE TABLE point_cluster_map AS |
| SELECT data.*, (madlib.closest_column(centroids, points, 'madlib.squared_dist_norm2')).* |
| FROM km_sample as data, km_result; |
| ALTER TABLE point_cluster_map RENAME column_id to cluster_id; -- change column name |
| SELECT * FROM point_cluster_map ORDER BY pid; |
| </pre> |
| <pre class="result"> |
| pid | points | cluster_id | distance |
| -----+--------------------------------------------------------------------+------------+------------------ |
| 1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065} | 1 | 3296.12614333444 |
| 2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050} | 1 | 8856.90882600111 |
| 3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} | 1 | 27072.3216580044 |
| 4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480} | 0 | 10261.9450375 |
| 5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735} | 1 | 82492.8673553345 |
| 6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450} | 0 | 5080.3612375 |
| 7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290} | 0 | 8090.4485875 |
| 8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295} | 0 | 7128.9931875 |
| 9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045} | 1 | 634.301947334443 |
| 10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045} | 1 | 646.584486004443 |
| (10 rows) |
| </pre> |
| |
| -# Display cluster ID. There are two steps to get the cluster id associated |
| with the centroid coordinates, if you need it. First unnest the cluster |
| centroids 2-D array to get a set of 1-D centroid arrays: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_centroids_unnest; |
| -- Run unnest function |
| CREATE TABLE km_centroids_unnest AS |
| SELECT (madlib.array_unnest_2d_to_1d(centroids)).* |
| FROM km_result; |
| SELECT * FROM km_centroids_unnest ORDER BY 1; |
| </pre> |
| <pre class="result"> |
| unnest_row_id | unnest_result |
| ---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| 1 | {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75} |
| 2 | {13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333} |
| (2 rows) |
| </pre> |
| Note that the ID column returned by array_unnest_2d_to_1d() is just a row ID and not the cluster ID assigned by k-means. The second step to get the cluster_id is: |
| <pre class="example"> |
| SELECT cent.*, (madlib.closest_column(centroids, unnest_result, 'madlib.squared_dist_norm2')).column_id as cluster_id |
| FROM km_centroids_unnest as cent, km_result |
| ORDER BY cent.unnest_row_id; |
| </pre> |
| <pre class="result"> |
| unnest_row_id | unnest_result | cluster_id |
| ---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+------------ |
| 1 | {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75} | 0 |
| 2 | {13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333} | 1 |
| (2 rows) |
| </pre> |
| |
| -# Run the same example as above, but using array input. Create the input table: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_arrayin CASCADE; |
| CREATE TABLE km_arrayin(pid int, |
| p1 float, |
| p2 float, |
| p3 float, |
| p4 float, |
| p5 float, |
| p6 float, |
| p7 float, |
| p8 float, |
| p9 float, |
| p10 float, |
| p11 float, |
| p12 float, |
| p13 float); |
| INSERT INTO km_arrayin VALUES |
| (1, 14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065), |
| (2, 13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050), |
| (3, 13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185), |
| (4, 14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480), |
| (5, 13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735), |
| (6, 14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450), |
| (7, 14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290), |
| (8, 14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295), |
| (9, 14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045), |
| (10, 13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045); |
| </pre> |
| Now find the cluster assignment for each point using random seeding: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_result_array; |
| -- Run kmeans algorithm |
| CREATE TABLE km_result_array AS |
| SELECT * FROM madlib.kmeans_random('km_arrayin', -- Table of source data |
| 'ARRAY[p1, p2, p3, p4, p5, p6, -- Points |
| p7, p8, p9, p10, p11, p12, p13]', |
| 2, -- Number of centroids to calculate |
| 'madlib.squared_dist_norm2', -- Distance function |
| 'madlib.avg', -- Aggregate function |
| 20, -- Number of iterations |
| 0.001); -- Fraction of centroids reassigned to keep iterating |
| -- Get point assignment |
| SELECT data.*, (madlib.closest_column(centroids, |
| ARRAY[p1, p2, p3, p4, p5, p6, |
| p7, p8, p9, p10, p11, p12, p13], |
| 'madlib.squared_dist_norm2')).column_id as cluster_id |
| FROM km_arrayin as data, km_result_array |
| ORDER BY data.pid; |
| </pre> |
| This produces the result in column format: |
| <pre class="result"> |
| pid | p1 | p2 | p3 | p4 | p5 | p6 | p7 | p8 | p9 | p10 | p11 | p12 | p13 | cluster_id |
| -----+-------+------+------+------+-----+------+------+------+------+--------+------+------+------+------------ |
| 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.8 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 0 |
| 2 | 13.2 | 1.78 | 2.14 | 11.2 | 1 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.49 | 1050 | 0 |
| 3 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.8 | 3.24 | 0.3 | 2.81 | 5.6799 | 1.03 | 3.17 | 1185 | 1 |
| 4 | 14.37 | 1.95 | 2.5 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.8 | 0.86 | 3.45 | 1480 | 1 |
| 5 | 13.24 | 2.59 | 2.87 | 21 | 118 | 2.8 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 0 |
| 6 | 14.2 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 | 6.75 | 1.05 | 2.85 | 1450 | 1 |
| 7 | 14.39 | 1.87 | 2.45 | 14.6 | 96 | 2.5 | 2.52 | 0.3 | 1.98 | 5.25 | 1.02 | 3.58 | 1290 | 1 |
| 8 | 14.06 | 2.15 | 2.61 | 17.6 | 121 | 2.6 | 2.51 | 0.31 | 1.25 | 5.05 | 1.06 | 3.58 | 1295 | 1 |
| 9 | 14.83 | 1.64 | 2.17 | 14 | 97 | 2.8 | 2.98 | 0.29 | 1.98 | 5.2 | 1.08 | 2.85 | 1045 | 0 |
| 10 | 13.86 | 1.35 | 2.27 | 16 | 98 | 2.98 | 3.15 | 0.22 | 1.85 | 7.2199 | 1.01 | 3.55 | 1045 | 0 |
| (10 rows) |
| </pre> |
| |
| <h4>Auto Clustering for Range of <em>k</em> Values</h4> |
| |
| -# Auto k selection. Now let's run k-means random for a variety of k values |
| and compare using simple silhouette and elbow methods. |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_result_auto, km_result_auto_summary; |
| SELECT madlib.kmeans_random_auto( |
| 'km_sample', -- points table |
| 'km_result_auto', -- output table |
| 'points', -- column name in point table |
| ARRAY[2,3,4,5,6], -- k values to try |
| 'madlib.squared_dist_norm2', -- distance function |
| 'madlib.avg', -- aggregate function |
| 20, -- max iterations |
| 0.001, -- minimum fraction of centroids reassigned to continue iterating |
| 'both' -- k selection algorithm (simple silhouette and elbow) |
| ); |
| \\x on |
| SELECT * FROM km_result_auto_summary; |
| </pre> |
| <pre class="result">-[ RECORD 1 ]-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 6 |
| centroids | {{14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.345,1.495,2.22,15,97.5,2.89,3.065,0.255,1.915,6.20995,1.045,3.2,1045},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}} |
| cluster_variance | {0,0,452.7633,8078.22646267333,5.346498005,0} |
| objective_fn | 8536.33626067833 |
| frac_reassigned | 0 |
| num_iterations | 3 |
| silhouette | 0.954496117675027 |
| elbow | -50296.3677976633 |
| selection_algorithm | silhouette |
| </pre> |
| The best selection above is made by the silhouette algorithm by default. |
| Note that the elbow method may select a different k value as the best. |
| To see results for all k values: |
| <pre class="example"> |
| SELECT * FROM km_result_auto ORDER BY k; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 2 |
| centroids | {{14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340},{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}} |
| cluster_variance | {60672.638245208,90512.324426408} |
| objective_fn | 151184.962671616 |
| frac_reassigned | 0 |
| num_iterations | 3 |
| silhouette | 0.872087020146542 |
| elbow | -1056.25028126836 |
| -[ RECORD 2 ]----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 3 |
| centroids | {{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}} |
| cluster_variance | {8078.22646267333,452.7633,90512.324426408} |
| objective_fn | 99043.3141890814 |
| frac_reassigned | 0 |
| num_iterations | 2 |
| silhouette | 0.895849874221733 |
| elbow | 20549.7753189989 |
| -[ RECORD 3 ]----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 4 |
| centroids | {{14.02,1.765,2.385,16.05,105.75,2.845,3.1075,0.2725,2.2325,5.93495,1.04,3.3725,1085},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}} |
| cluster_variance | {14227.41709401,0,30561.74805,0} |
| objective_fn | 44789.16514401 |
| frac_reassigned | 0 |
| num_iterations | 3 |
| silhouette | 0.877839150000438 |
| elbow | 17535.7421610686 |
| -[ RECORD 4 ]----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 5 |
| centroids | {{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{14.225,2.01,2.53,16.1,108.5,2.55,2.515,0.305,1.615,5.15,1.04,3.58,1292.5},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050},{14.04,1.8225,2.435,16.65,110,2.845,2.97,0.295,1.985,5.594975,1.0425,3.3125,972.5},{13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185}} |
| cluster_variance | {452.7633,329.8988,0,76176.4564000075,0} |
| objective_fn | 76959.1185000075 |
| frac_reassigned | 0 |
| num_iterations | 2 |
| silhouette | 0.771207558995578 |
| elbow | -28690.3421973961 |
| -[ RECORD 5 ]----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| k | 6 |
| centroids | {{14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065},{13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735},{14.285,1.855,2.475,16,112.5,3.56,3.44,0.29,2.075,7.275,0.955,3.15,1465},{13.87,2.12666666666667,2.57666666666667,16.9333333333333,106,2.63333333333333,2.75666666666667,0.303333333333333,2.01333333333333,5.32663333333333,1.03666666666667,3.44333333333333,1256.66666666667},{14.345,1.495,2.22,15,97.5,2.89,3.065,0.255,1.915,6.20995,1.045,3.2,1045},{13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}} |
| cluster_variance | {0,0,452.7633,8078.22646267333,5.346498005,0} |
| objective_fn | 8536.33626067833 |
| frac_reassigned | 0 |
| num_iterations | 3 |
| silhouette | 0.954496117675027 |
| elbow | -50296.3677976633 |
| </pre> |
| -# Simplified silhouette per point. Let's say we want the simplified silhouette |
| coefficient for each point in the data set, for the case where <em>k=3</em>: |
| <pre class="example"> |
| DROP TABLE IF EXISTS km_points_silh; |
| SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input points table |
| 'km_points_silh', -- Output table |
| 'pid', -- Point ID column in input table |
| 'points', -- Points column in input table |
| (SELECT centroids FROM km_result_auto WHERE k=3), -- centroids array |
| 'madlib.squared_dist_norm2' -- Distance function |
| ); |
| SELECT * FROM km_points_silh ORDER BY pid; |
| </pre> |
| <pre class="result"> |
| pid | centroid_id | neighbor_centroid_id | silh |
| -----+-------------+----------------------+------------------- |
| 1 | 2 | 0 | 0.800019825058391 |
| 2 | 2 | 0 | 0.786712987562913 |
| 3 | 0 | 2 | 0.867496080386644 |
| 4 | 1 | 0 | 0.995466498952947 |
| 5 | 2 | 0 | 0.761551610381542 |
| 6 | 1 | 0 | 0.993950286967157 |
| 7 | 0 | 1 | 0.960621637528528 |
| 8 | 0 | 1 | 0.941493577405087 |
| 9 | 2 | 0 | 0.925822020308802 |
| 10 | 2 | 0 | 0.92536421766532 |
| (10 rows) |
| </pre> |
| |
| @anchor background |
| @par Technical Background |
| |
| <b>k-means Algorithm</b> |
| <br> |
| Formally, we wish to minimize the following objective function: |
| \f[ |
| (c_1, \dots, c_k) \mapsto \sum_{i=1}^n \min_{j=1}^k \operatorname{dist}(x_i, c_j) |
| \f] |
| In the most common case, \f$ \operatorname{dist} \f$ is the square of the |
| Euclidean distance, though other distance measures can be used. |
| |
| This problem is computationally difficult (NP-hard), yet the |
| local-search heuristic proposed by Lloyd <a href="#kmeans-lit-4">[4]</a> performs reasonably well in |
| practice. In fact, it is so ubiquitous today that it is |
| often referred to as the <em>standard algorithm</em> or even just the |
| <em>k-means algorithm</em>. It works as follows: |
| |
| -# Seed the \f$ k \f$ centroids, meaning specify their initial positions (see below). |
| -# Repeat until convergence: |
| -# Assign each point to its closest centroid. |
| -# Move each centroid to a position that minimizes the sum of distances in this |
| cluster. |
| -# Convergence is achieved when no points change their assignments during step |
| 2a. |
| |
| Since the objective function decreases in every step, this algorithm is |
| guaranteed to converge to a local optimum. |
| |
| The quality of k-means is highly influenced by the choice of the seeding. |
| In MADlib we support uniform-at-random sampling, kmeans++ as well as the |
| ability for the user to enter manual seeding based on other methods. |
| |
| k-means++ seeding <a href="#kmeans-lit-1">[1]</a> starts with a single centroid chosen randomly among the input points. It then |
| iteratively chooses new centroids from the input points until there are a total of k centroids. The |
| probability for picking a particular point is proportional to its minimum distance to any existing |
| centroid. Intuitively, k-means++ favors seedings where centroids are spread out over the whole |
| range of the input points, while at the same time not being too susceptible to outliers. |
| |
| @anchor simplified_silhouette |
| <b>Silhouette</b> |
| <br> |
| |
| To evaluate the validity of clustering with different k values, |
| the objective function is not ideal because it decreases as |
| k value increases, so it has a bias |
| toward selecting the largest k as the best result <a href="#kmeans-lit-6">[6]</a>. |
| Therefore we use other internal measures to evaluate cluster validity. |
| |
| One such measure is silhouette score. The original formulation <a href="#kmeans-lit-7">[7]</a> |
| computes the average distance of a data |
| point to all the other data points in its own cluster, and to all the data points |
| in the neighbouring cluster nearest to the data point. |
| This is expensive for a large number of points |
| since it requires the full pairwise distance matrix over all data. |
| |
| In the simplified silhouette <a href="#kmeans-lit-3">[3]</a> which is used in MADlib, |
| the distance of a data point to a cluster |
| is represented with the distance to the cluster centroid instead of the |
| average distance to all (other) data points in the cluster. |
| This has the advantage of being much faster to compute, and can be shown |
| to be comparable in performance to the full silhouette <a href="#kmeans-lit-8">[8]</a>. |
| |
| @anchor elbow |
| <b>Elbow Method</b> |
| <br> |
| The elbow method considers the percentage of variance explained as a function of number of clusters. |
| The idea is not to add another cluster if it doesn't model the data better. |
| Graphically it means identifying the "elbow" in the curve of sum of squared errors |
| vs. number of clusters (k). This was possibly originally suggested in <a href="#kmeans-lit-9">[9]</a>. |
| To locate the elbow, we use the 2nd derivative of the |
| objective function using the NumPy gradient function <a href="#kmeans-lit-2">[2]</a>. |
| |
| @anchor literature |
| @literature |
| |
| @anchor kmeans-lit-1 |
| [1] David Arthur, Sergei Vassilvitskii: k-means++: the advantages of careful |
| seeding, Proceedings of the 18th Annual ACM-SIAM Symposium on Discrete |
| Algorithms (SODA'07), pp. 1027-1035. |
| |
| @anchor kmeans-lit-2 |
| [2] NumPy gradient function, https://docs.scipy.org/doc/numpy/reference/generated/numpy.gradient.html |
| |
| @anchor kmeans-lit-3 |
| [3] E. R. Hruschka, L. N. C. Silva, R. J. G. B. Campello: Clustering |
| Gene-Expression Data: A Hybrid Approach that Iterates Between k-Means and |
| Evolutionary Search. In: Studies in Computational Intelligence - Hybrid |
| Evolutionary Algorithms. pp. 313-335. Springer. 2007. |
| |
| @anchor kmeans-lit-4 |
| [4] Lloyd, Stuart: Least squares quantization in PCM. Technical Note, Bell |
| Laboratories. Published much later in: IEEE Transactions on Information |
| Theory 28(2), pp. 128-137. 1982. |
| |
| @anchor kmeans-lit-5 |
| [5] Leisch, Friedrich: A Toolbox for K-Centroids Cluster Analysis. In: Computational |
| Statistics and Data Analysis, 51(2). pp. 526-544. 2006. |
| |
| @anchor kmeans-lit-6 |
| [6] Jain, A.K.: Data clustering: 50 years beyond k-means. Pattern recognition |
| letters 31(8), 651–666 (2010) |
| |
| @anchor kmeans-lit-7 |
| [7] Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the Interpretation |
| and Validation of Cluster Analysis”. Computational and Applied Mathematics 20: 53-65 |
| |
| @anchor kmeans-lit-8 |
| [8] Wang F., Franco-Penya HH., Kelleher J.D., Pugh J., Ross R. (2017) An Analysis |
| of the Application of Simplified Silhouette to the Evaluation of k-means Clustering |
| Validity. In: Perner P. (eds) Machine Learning and Data Mining in Pattern Recognition. |
| MLDM 2017. Lecture Notes in Computer Science, vol 10358. Springer, Cham |
| |
| @anchor kmeans-lit-9 |
| [9] Robert L. Thorndike (December 1953). "Who Belongs in the Family?". Psychometrika. 18 (4): 267–276. doi:10.1007/BF02289263. |
| |
| @anchor related |
| @par Related Topics |
| |
| File kmeans.sql_in documenting the k-Means SQL functions |
| |
| \ref grp_svec |
| |
| closest_column() |
| |
| array_unnest_2d_to_1d() |
| |
| @internal |
| @sa namespace kmeans (documenting the implementation in Python) |
| @endinternal |
| */ |
| |
| /* |
| * @brief k-Means return type |
| * |
| * A composite value: |
| * - <tt>centroids</tt> - Matrix containing the new \f$ l \leq k \f$ |
| * repositioned centroids as columns. If this matrix has \f$ l < k \f$ |
| * columns, one or more old centroids no longer were closest to any point. |
| * - <tt>old_centroid_ids</tt> - The order of the centroids in |
| * <tt>centroid</tt> is not guaranteed to be consitent across iterations. |
| * In particular, if a centroid is no longer closest to any point it can be |
| * dropped and a new centroid is added afterwards. We therefore need to map |
| * positions in <tt>centroids</tt> to the respective positions in the |
| * previous iteration. |
| * - <tt>objective_fn</tt> - Value of the objective function, i.e., |
| * \f$ \sum_{x \in P} \dist(x, C)^2 \f$ where |
| * \f$ P \f$ is the set of points, \f$ C \f$ is the set of centroids, and |
| * \f$ \dist(x, C) := \min_{c \in C} \operatorname{dist}(x, c) \f$. |
| * - <tt>frac_reassigned</tt> - Fraction of points that was assigned a |
| * different centroid in the current iteration. |
| * - <tt>num_iterations</tt> - Number of iterations performed (so far). |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.kmeans_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.kmeans_result AS ( |
| centroids DOUBLE PRECISION[][], |
| cluster_variance DOUBLE PRECISION[], |
| objective_fn DOUBLE PRECISION, |
| frac_reassigned DOUBLE PRECISION, |
| num_iterations INTEGER |
| ); |
| |
| /* |
| * @brief k-Means inter-iteration state type |
| * |
| * A composite value like \ref{kmeans_result}. Additional fields: |
| * - <tt>old_centroid_its</tt> - The order of the centroids in |
| * <tt>centroid</tt> is not guaranteed to be consitent across iterations. |
| * In particular, if a centroid is no longer closest to any point it can be |
| * dropped and a new centroid is added afterwards. We therefore need to map |
| * positions in <tt>centroids</tt> to the respective positions in the |
| * previous iteration. |
| * - <tt>num_iterations</tt> - Number of iterations performed (so far). |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.kmeans_state CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.kmeans_state AS ( |
| centroids DOUBLE PRECISION[][], |
| old_centroid_ids INTEGER[], |
| cluster_variance DOUBLE PRECISION[], |
| objective_fn DOUBLE PRECISION, |
| frac_reassigned DOUBLE PRECISION |
| ); |
| |
| /** |
| * @internal |
| * @brief Execute a SQL command where $1, ..., $4 are substituted with the |
| * given arguments. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args( |
| sql VARCHAR, DOUBLE PRECISION[][], REGPROC, INTEGER, DOUBLE PRECISION, VARCHAR |
| ) RETURNS VOID |
| VOLATILE |
| CALLED ON NULL INPUT |
| LANGUAGE c |
| AS 'MODULE_PATHNAME', 'exec_sql_using' |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans( |
| rel_args VARCHAR, |
| rel_state VARCHAR, |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| agg_centroid VARCHAR) |
| RETURNS INTEGER |
| VOLATILE |
| LANGUAGE plpythonu |
| AS $$PythonFunction(kmeans, kmeans, compute_kmeans)$$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Validate the kmeans arguments |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_src( |
| rel_source VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(kmeans, kmeans, kmeans_validate_src) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_expr( |
| rel_source VARCHAR, |
| expr_point VARCHAR |
| ) RETURNS BOOLEAN AS $$ |
| PythonFunction(kmeans, kmeans, kmeans_validate_expr) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| initial_centroids DOUBLE PRECISION[][], /*+ DEFAULT NULL */ |
| seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */ |
| ) RETURNS VOID AS $$ |
| DECLARE |
| num_centroids INTEGER; |
| num_points INTEGER; |
| array_dim INTEGER[]; |
| rel_source_regclass REGCLASS; |
| rel_filtered VARCHAR; |
| BEGIN |
| |
| -- Validate the expr_point input. Since we don't need a view at this |
| -- point, the output is safe to ignore. |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point); |
| |
| rel_source_regclass := rel_source; |
| |
| IF (initial_centroids IS NOT NULL) THEN |
| num_centroids := array_upper(initial_centroids,1); |
| ELSE |
| num_centroids := k; |
| END IF; |
| |
| IF (k <= 0) THEN |
| RAISE EXCEPTION 'Number of clusters k must be a positive integer.'; |
| END IF; |
| IF (k > 32767) THEN |
| RAISE EXCEPTION 'Kmeans Error: Number of clusters exceeded maximum limit |
| Number of clusters k must be <= 32767 (for results to be returned in a |
| reasonable amount of time).'; |
| END IF; |
| |
| EXECUTE $sql$ SELECT count(*) |
| FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$ |
| WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$ |
| INTO num_points ; |
| IF (num_points < k OR num_points < num_centroids) THEN |
| RAISE EXCEPTION 'Number of centroids is greater than number of points.'; |
| END IF; |
| IF (k < num_centroids) THEN |
| RAISE WARNING 'Kmeans Warning: Number of initial centroids more than requested clusters. |
| Number of clusters k is less than number of supplied initial centroids. |
| Number of final clusters will equal number of supplied initial centroids.'; |
| END IF; |
| |
| IF (seeding_sample_ratio <= 0.) THEN |
| RAISE EXCEPTION 'Kmeans Error: Sampling ratio should be greater than 0 to perform seeding.'; |
| END IF; |
| |
| EXECUTE $sql$ |
| SELECT array_agg(length) FROM ( |
| SELECT ( |
| array_upper( $sql$ || expr_point || $sql$::float8[], 1) |
| - array_lower( $sql$ || expr_point || $sql$::float8[], 1) + 1 |
| ) AS length, |
| array_lower( $sql$ || expr_point || $sql$::float8[], 1) AS lower |
| FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$ |
| WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 |
| GROUP BY length, lower) t $sql$ |
| INTO array_dim; |
| IF (array_upper(array_dim, 1) > 1) THEN |
| RAISE EXCEPTION 'Kmeans error: Feature column has arrays of different dimensions. |
| The input table contains arrays (evaluated from ''expr_point'') of different dimensions.'; |
| END IF; |
| IF (initial_centroids is not NULL) THEN |
| IF (array_upper(initial_centroids, 2) != array_dim[1]) THEN |
| RAISE EXCEPTION 'Kmeans error: mismatched dimensionality between initial centroids and feature array |
| The input table contains arrays (evaluated from expr_point) of |
| dimension (%) which is not same as dimension of |
| provided initial centroid (%).', |
| array_dim[1], array_upper(initial_centroids, 2); |
| END IF; |
| END IF; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */ |
| ) RETURNS VOID AS $$ |
| DECLARE |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__seeding_validate_args($1, $2, $3, $4, |
| 1.0::DOUBLE PRECISION); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Perform Lloyd's k-means local-search heuristic |
| * |
| * @param rel_source Name of the relation containing input points |
| * @param expr_point Expression evaluating to point coordinates for each tuple |
| * @param initial_centroids Matrix containing the initial centroids as columns |
| * @param fn_dist Name of a function with signature |
| * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that |
| * returns the distance between two points. The default is the |
| * \ref squared_dist_norm2(float8[],float8[]) "squared Euclidean distance". |
| * @param agg_centroid Name of an aggregate function with signature |
| * <tt>DOUBLE PRECISION[] -> DOUBLE PRECISION[]</tt> that, for each group |
| * of points, returns a centroid. In order for Lloyd's local-search |
| * heuristic to provably converge and to return a local minimum, this |
| * centroid should minimize the sum of distances between each point in the |
| * group and the centroid. The default is the |
| * \ref avg(float8[]) "average (mean/barycenter in Euclidean space)", |
| * which satisfies this property if <tt>fn_dist = 'squared_dist_norm2'</tt>. |
| * @param max_num_iterations Maximum number of iterations |
| * @param min_frac_reassigned Fraction of reassigned points below which |
| * convergence is assumed and the algorithm terminates |
| * @returns A composite value: |
| * - <tt>centroids</tt> - Matrix with \f$ k \f$ centroids as columns. |
| * - <tt>frac_reassigned</tt> - Fraction of points that were assigned a |
| * different centroid in the last iteration. |
| * - <tt>num_iterations</tt> - The number of iterations before the |
| * algorithm terminated |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| initial_centroids DOUBLE PRECISION[][], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS MADLIB_SCHEMA.kmeans_result AS $$ |
| DECLARE |
| theIteration INTEGER; |
| theResult MADLIB_SCHEMA.kmeans_result; |
| oldClientMinMessages VARCHAR; |
| class_rel_source REGCLASS; |
| proc_fn_dist REGPROCEDURE; |
| proc_agg_centroid REGPROCEDURE; |
| rel_filtered VARCHAR; |
| num_points INTEGER; |
| k INTEGER; |
| centroids FLOAT8[]; |
| data_size INTEGER; |
| init_size INTEGER; |
| BEGIN |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source); |
| |
| IF (array_upper(initial_centroids,1) IS NULL) THEN |
| RAISE EXCEPTION 'Kmeans error: No valid initial centroids given.'; |
| END IF; |
| |
| centroids := ARRAY(SELECT unnest(initial_centroids)); |
| IF (SELECT MADLIB_SCHEMA.svec_elsum(centroids)) >= 'Infinity'::float THEN |
| RAISE EXCEPTION 'Kmeans error: At least one initial centroid has non-finite values.'; |
| END IF; |
| |
| -- init_size := (SELECT array_upper(initial_centroids[1], 1)); |
| -- EXECUTE 'SELECT ARRAY_UPPER(' || expr_point || ', 1) FROM ' || rel_source || ' limit 1;' INTO data_size; |
| -- IF (init_size != data_size) THEN |
| -- RAISE EXCEPTION 'The initial centroids should have the same dimension as the data point!'; |
| -- END IF; |
| |
| class_rel_source := rel_source; |
| |
| proc_fn_dist := fn_dist |
| || '(DOUBLE PRECISION[], DOUBLE PRECISION[])'; |
| |
| -- Handle PG11 pg_proc table changes |
| IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| proc_agg_centroid := agg_centroid || '(DOUBLE PRECISION[])'; |
| IF (SELECT prorettype != 'DOUBLE PRECISION[]'::regtype OR proisagg = FALSE |
| FROM pg_proc WHERE oid = proc_agg_centroid) THEN |
| RAISE EXCEPTION 'Kmeans error: Mean aggregate has wrong signature or is not an aggregate.'; |
| END IF; |
| ELSE |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a' |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| proc_agg_centroid := agg_centroid || '(DOUBLE PRECISION[])'; |
| IF (SELECT prorettype != 'DOUBLE PRECISION[]'::regtype OR prokind != 'a' |
| FROM pg_proc WHERE oid = proc_agg_centroid) THEN |
| RAISE EXCEPTION 'Kmeans error: Mean aggregate has wrong signature or is not an aggregate.'; |
| END IF; |
| END IF; |
| |
| IF (min_frac_reassigned < 0) OR (min_frac_reassigned > 1) THEN |
| RAISE EXCEPTION 'Kmeans error: Invalid convergence threshold (must be a fraction between 0 and 1).'; |
| END IF; |
| IF (max_num_iterations < 0) THEN |
| RAISE EXCEPTION 'Kmeans error: Number of iterations must be a non-negative integer.'; |
| END IF; |
| |
| -- Extra parameter check added so that ERROR output is more user-readable (doesn't include Python traceback) |
| k := array_upper(initial_centroids,1); |
| IF (k <= 0) THEN |
| RAISE EXCEPTION 'Kmeans error: Number of clusters k must be a positive integer.'; |
| END IF; |
| IF (k > 32767) THEN |
| RAISE EXCEPTION 'Kmeans error: Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).'; |
| END IF; |
| EXECUTE $sql$ SELECT count(*) |
| FROM $sql$ || textin(regclassout(class_rel_source)) || $sql$ |
| WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$ |
| INTO num_points ; |
| IF (num_points < k) THEN |
| RAISE EXCEPTION 'Kmeans error: Number of centroids is greater than number of points.'; |
| END IF; |
| |
| -- We first setup the argument table. Rationale: We want to avoid all data |
| -- conversion between native types and Python code. Instead, we use Python |
| -- as a pure driver layer. |
| PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| |
| -- Unfortunately, the EXECUTE USING syntax is only available starting |
| -- PostgreSQL 8.4: |
| -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN |
| -- We therefore have to emulate. |
| PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$ |
| DROP TABLE IF EXISTS pg_temp._madlib_kmeans_args; |
| CREATE TABLE pg_temp._madlib_kmeans_args AS |
| SELECT |
| $1 AS initial_centroids, |
| array_upper($1, 1) AS k, |
| $2 AS fn_dist, |
| $3 AS max_num_iterations, |
| $4 AS min_frac_reassigned, |
| $5 as fn_dist_name; |
| $sql$, |
| initial_centroids, proc_fn_dist, max_num_iterations, |
| min_frac_reassigned, fn_dist); |
| |
| -- Perform acutal computation. |
| -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| -- operators from regclass to varchar/text. |
| theIteration := MADLIB_SCHEMA.internal_compute_kmeans( |
| '_madlib_kmeans_args', |
| '_madlib_kmeans_state', |
| textin(regclassout(class_rel_source)), expr_point, |
| textin(regprocout(proc_agg_centroid))); |
| |
| -- Retrieve result from state table and return it |
| EXECUTE |
| $sql$ |
| SELECT (_state).centroids, (_state).cluster_variance, (_state).objective_fn, |
| (_state).frac_reassigned, NULL |
| FROM _madlib_kmeans_state |
| WHERE _iteration = $sql$ || theIteration || $sql$ |
| $sql$ |
| INTO theResult; |
| -- The number of iterations are not updated in the C++ code. We do it here. |
| IF NOT (theResult IS NULL) THEN |
| theResult.num_iterations = theIteration; |
| END IF; |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| RETURN theResult; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| initial_centroids DOUBLE PRECISION[][], |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR, |
| max_num_iterations INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, $6, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| initial_centroids DOUBLE PRECISION[][], |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, 20, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| initial_centroids DOUBLE PRECISION[][], |
| fn_dist VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, 'MADLIB_SCHEMA.avg', 20, |
| 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| initial_centroids DOUBLE PRECISION[][] |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, |
| 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args( |
| sql VARCHAR, INTEGER, REGPROC, DOUBLE PRECISION[][], VARCHAR |
| ) RETURNS VOID |
| VOLATILE |
| CALLED ON NULL INPUT |
| LANGUAGE c |
| AS 'MODULE_PATHNAME', 'exec_sql_using' |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeanspp_seeding( |
| rel_args VARCHAR, |
| rel_state VARCHAR, |
| rel_source VARCHAR, |
| expr_point VARCHAR) |
| RETURNS INTEGER |
| AS $$PythonFunction(kmeans, kmeans, compute_kmeanspp_seeding)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @brief k-Means++ Seeding |
| * |
| * @param rel_source Name of the relation containing input points |
| * @param expr_point Expression evaluating to point coordinates for each tuple |
| * @param k Number of centroids |
| * @param fn_dist Name of a function with signature |
| * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that |
| * returns the distance between two points |
| * @param initial_centroids A matrix containing up to \f$ k \f$ columns as |
| * columns. kmeanspp_seeding() proceeds exactly as if these centroids had |
| * already been generated in previous iterations. This parameter may be |
| * NULL in which all \f$ k \f$ centroids will be generated. |
| * @returns A matrix containing \f$ k \f$ centroids as columns |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */, |
| seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */ |
| ) RETURNS DOUBLE PRECISION[][] AS $$ |
| DECLARE |
| theIteration INTEGER; |
| theResult DOUBLE PRECISION[][]; |
| oldClientMinMessages VARCHAR; |
| sampled_rel_source VARCHAR; |
| sampled_col_name VARCHAR; |
| class_rel_source REGCLASS; |
| rel_filtered VARCHAR; |
| proc_fn_dist REGPROCEDURE; |
| num_points INTEGER; |
| num_centroids INTEGER; |
| num_array_dim INTEGER; |
| BEGIN |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source); |
| PERFORM MADLIB_SCHEMA.__seeding_validate_args( |
| rel_source, expr_point, k, initial_centroids, seeding_sample_ratio); |
| |
| proc_fn_dist := fn_dist || '(DOUBLE PRECISION[], DOUBLE PRECISION[])'; |
| IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| ELSE |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a' |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| END IF; |
| |
| -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| -- operators from regclass to varchar/text. |
| class_rel_source := rel_source; |
| |
| sampled_rel_source = MADLIB_SCHEMA.__unique_string(); |
| sampled_col_name = MADLIB_SCHEMA.__unique_string(); |
| IF (seeding_sample_ratio < 1.0) THEN |
| EXECUTE 'DROP TABLE IF EXISTS '||sampled_rel_source||' CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE '||sampled_rel_source||' AS |
| SELECT * |
| FROM |
| ( |
| SELECT |
| *, |
| random() AS '||sampled_col_name||' |
| FROM '||textin(regclassout(class_rel_source))||' |
| WHERE abs( |
| coalesce( |
| MADLIB_SCHEMA.svec_elsum('||expr_point||'), |
| ''Infinity''::FLOAT8 |
| ) |
| ) < ''Infinity''::FLOAT8 |
| ) subq |
| WHERE '||sampled_col_name||' < '||seeding_sample_ratio; |
| class_rel_source := sampled_rel_source; |
| END IF; |
| |
| -- We first setup the argument table. Rationale: We want to avoid all data |
| -- conversion between native types and Python code. Instead, we use Python |
| -- as a pure driver layer. |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| -- Unfortunately, the EXECUTE USING syntax is only available starting |
| -- PostgreSQL 8.4: |
| -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN |
| -- We therefore have to emulate. |
| PERFORM MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args($sql$ |
| DROP TABLE IF EXISTS pg_temp._madlib_kmeanspp_args; |
| CREATE TEMPORARY TABLE _madlib_kmeanspp_args AS |
| SELECT |
| $1 AS k, |
| $2 AS fn_dist, |
| $3 AS initial_centroids, |
| $4 AS fn_dist_name |
| $sql$, |
| k, proc_fn_dist, initial_centroids, fn_dist); |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| |
| -- Perform acutal computation. |
| theIteration := ( |
| SELECT MADLIB_SCHEMA.internal_compute_kmeanspp_seeding( |
| '_madlib_kmeanspp_args', '_madlib_kmeanspp_state', |
| textin(regclassout(class_rel_source)), expr_point) |
| ); |
| |
| -- Retrieve result from state table and return it |
| EXECUTE |
| $sql$ |
| SELECT _state FROM _madlib_kmeanspp_state |
| WHERE _iteration = $sql$ || theIteration || $sql$ |
| $sql$ |
| INTO theResult; |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| |
| IF (seeding_sample_ratio < 1.0) THEN |
| EXECUTE 'DROP TABLE IF EXISTS '||sampled_rel_source||' CASCADE'; |
| END IF; |
| |
| RETURN theResult; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR, |
| initial_centroids DOUBLE PRECISION[][] |
| ) RETURNS DOUBLE PRECISION[][] |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, $5, 1.0::DOUBLE PRECISION) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR |
| ) RETURNS DOUBLE PRECISION[][] |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, NULL, 1.0::DOUBLE PRECISION) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER |
| ) RETURNS DOUBLE PRECISION[][] |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, |
| 'MADLIB_SCHEMA.squared_dist_norm2', NULL, 1.0::DOUBLE PRECISION) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @brief Run k-Means++. |
| * |
| * This is a shortcut for running k-means++. It is equivalent to |
| * <pre>SELECT \ref kmeans( |
| rel_source, |
| expr_point, |
| \ref kmeanspp_seeding( |
| rel_source, |
| expr_point, |
| k, |
| fn_dist |
| ), |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| )</pre> |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */, |
| seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */ |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4, NULL, $8), |
| $4, $5, $6, $7); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, $6, $7, 1.0::DOUBLE PRECISION); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR, |
| max_num_iterations INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, $6, |
| 0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, $5, 20::INTEGER, |
| 0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, $4, |
| 'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER, |
| 0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeanspp($1, $2, $3, |
| 'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR, |
| 'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER, |
| 0.001::DOUBLE PRECISION, 1.0::DOUBLE PRECISION); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args( |
| sql VARCHAR, INTEGER, DOUBLE PRECISION[][] |
| ) RETURNS VOID |
| VOLATILE |
| CALLED ON NULL INPUT |
| LANGUAGE c |
| AS 'MODULE_PATHNAME', 'exec_sql_using' |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans_random_seeding( |
| rel_args VARCHAR, |
| rel_state VARCHAR, |
| rel_source VARCHAR, |
| expr_point VARCHAR) |
| RETURNS INTEGER |
| AS $$PythonFunction(kmeans, kmeans, compute_kmeans_random_seeding)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @brief k-Means Random Seeding |
| * |
| * @param rel_source Name of the relation containing input points |
| * @param expr_point Expression evaluating to point coordinates for each tuple |
| * @param k Number of centroids |
| * @param initial_centroids A matrix containing up to \f$ k \f$ columns as |
| * columns. kmeanspp_seeding() proceeds exactly as if these centroids had |
| * already been generated in previous iterations. This parameter may be |
| * NULL in which all \f$ k \f$ centroids will be generated. |
| * @returns A matrix containing \f$ k \f$ centroids as columns |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| initial_centroids DOUBLE PRECISION[][] /*+ DEFAULT NULL */ |
| ) RETURNS DOUBLE PRECISION[][] AS $$ |
| DECLARE |
| theIteration INTEGER; |
| theResult DOUBLE PRECISION[][]; |
| oldClientMinMessages VARCHAR; |
| class_rel_source REGCLASS; |
| num_points INTEGER; |
| num_centroids INTEGER; |
| rel_filtered VARCHAR; |
| BEGIN |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source); |
| PERFORM MADLIB_SCHEMA.__seeding_validate_args( |
| rel_source, expr_point, k, initial_centroids); |
| |
| -- We first setup the argument table. Rationale: We want to avoid all data |
| -- conversion between native types and Python code. Instead, we use Python |
| -- as a pure driver layer. |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| -- Unfortunately, the EXECUTE USING syntax is only available starting |
| -- PostgreSQL 8.4: |
| -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN |
| -- We therefore have to emulate. |
| PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args($sql$ |
| DROP TABLE IF EXISTS pg_temp._madlib_kmeans_random_args; |
| CREATE TEMPORARY TABLE _madlib_kmeans_random_args AS |
| SELECT $1 AS k, $2 AS initial_centroids; |
| $sql$, |
| k, initial_centroids); |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| |
| -- Perform acutal computation. |
| -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion |
| -- operators from regclass to varchar/text. |
| class_rel_source := rel_source; |
| theIteration := ( |
| SELECT MADLIB_SCHEMA.internal_compute_kmeans_random_seeding( |
| '_madlib_kmeans_random_args', '_madlib_kmeans_random_state', |
| textin(regclassout(class_rel_source)), expr_point) |
| ); |
| |
| -- Retrieve result from state table and return it |
| EXECUTE |
| $sql$ |
| SELECT _state FROM _madlib_kmeans_random_state |
| WHERE _iteration = $sql$ || theIteration || $sql$ |
| $sql$ |
| INTO theResult; |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| RETURN theResult; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_seeding( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER |
| ) RETURNS DOUBLE PRECISION[][] |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3, NULL) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @brief Run k-Means with random seeding. |
| * |
| * This is a shortcut for running k-means with random seeding. It is equivalent |
| * to |
| * <pre>SELECT \ref kmeans( |
| rel_source, |
| expr_point, |
| \ref kmeans_random_seeding( |
| rel_source, |
| expr_point, |
| k |
| ), |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| )</pre> |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| $4, $5, $6, $7); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR, |
| max_num_iterations INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| $4, $5, $6, 0.001); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| $4, $5, 20, 0.001); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER, |
| fn_dist VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| $4, 'MADLIB_SCHEMA.avg', 20, 0.001); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| ret MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| ret = MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3), |
| 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001); |
| RETURN ret; |
| END |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @internal |
| * @brief Execute a SQL command where $1, ..., $6 are substituted with the |
| * given arguments. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args( |
| sql VARCHAR, rel_source VARCHAR, expr_point VARCHAR, |
| fn_dist VARCHAR, agg_centroid VARCHAR, max_num_iterations INTEGER, |
| min_frac_reassigned DOUBLE PRECISION |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| CALLED ON NULL INPUT |
| LANGUAGE c |
| AS 'MODULE_PATHNAME', 'exec_sql_using' |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| * @brief Perform Lloyd's k-means local-search heuristic, but with initial |
| * centroids stored in a table |
| * |
| * This is a shortcut for running k-means with initial centroids stored in a |
| * table (as opposed to an array of centroids). It is equivalent |
| * to |
| * <pre>SELECT \ref kmeans( |
| rel_source, |
| expr_point, |
| (SELECT \ref matrix_agg($expr_centroid) FROM $rel_initial_centroids), |
| fn_dist, |
| agg_centroid, |
| max_num_iterations, |
| min_frac_reassigned |
| )</pre> |
| * where <tt>$expr_centroid</tt> and <tt>$rel_initial_centroids</tt> denote |
| * textual substituions. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| rel_initial_centroids VARCHAR, |
| expr_centroid VARCHAR, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE plpgsql |
| AS $$ |
| DECLARE |
| class_rel_initial_centroids REGCLASS; |
| theResult MADLIB_SCHEMA.kmeans_result; |
| BEGIN |
| class_rel_initial_centroids := rel_initial_centroids; |
| SELECT * FROM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$ |
| SELECT MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| ( |
| SELECT MADLIB_SCHEMA.matrix_agg(($sql$ || expr_centroid || $sql$)::FLOAT8[]) |
| FROM $sql$ || textin(regclassout(class_rel_initial_centroids)) |
| || $sql$ |
| ), |
| $3, $4, $5, $6) |
| $sql$, |
| rel_source, expr_point, |
| fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned) |
| INTO theResult; |
| RETURN theResult; |
| END; |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| rel_initial_centroids VARCHAR, |
| expr_centroid VARCHAR, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR, |
| max_num_iterations INTEGER |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| $3, $4, $5, $6, $7, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| rel_initial_centroids VARCHAR, |
| expr_centroid VARCHAR, |
| fn_dist VARCHAR, |
| agg_centroid VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| $3, $4, $5, $6, 20, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| rel_initial_centroids VARCHAR, |
| expr_centroid VARCHAR, |
| fn_dist VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| $3, $4, $5, 'MADLIB_SCHEMA.avg', 20, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| rel_initial_centroids VARCHAR, |
| expr_centroid VARCHAR |
| ) RETURNS MADLIB_SCHEMA.kmeans_result |
| VOLATILE |
| LANGUAGE sql AS $$ |
| SELECT MADLIB_SCHEMA.kmeans( |
| $1, $2, |
| $3, $4, |
| 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001) |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /** |
| * @internal |
| * @brief Execute a SQL command where $1, ..., $3 are substituted with the |
| * given arguments. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_silhouette_args( |
| sql VARCHAR, |
| centroids DOUBLE PRECISION[][], |
| fn_dist REGPROC, |
| fn_dist_name VARCHAR |
| ) RETURNS DOUBLE PRECISION |
| STABLE |
| CALLED ON NULL INPUT |
| LANGUAGE c |
| AS 'MODULE_PATHNAME', 'exec_sql_using' |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| * @brief Compute a simplified version of the silhouette coefficient |
| * |
| * @param rel_source Name of the relation containing input points |
| * @param expr_point Expression evaluating to point coordinates \f$ x_i \f$ for |
| * each tuple |
| * @param centroids Matrix \f$ M = (\vec{m_0} \dots \vec{m_{k-1}}) |
| * \in \mathbb{R}^{d \times k} \f$ with \f$ k \f$ columns, where column |
| * \f$ i \f$ contains the position of centroid \f$ i \f$. |
| * @param fn_dist Name of a function with signature |
| * <tt>DOUBLE PRECISION[] x DOUBLE PRECISION[] -> DOUBLE PRECISION</tt> that |
| * returns the distance between two points |
| * @return For each point \f$ x_i \f$, let |
| * \f$ d_1( x_i ) \f$ and \f$ d_2( x_i ) \f$ be the distance to the closest |
| * and 2nd-closest centroid, respectively. If there is more than one |
| * closest centroids then \f$ d_1( x_i ) = d_2( x_i )\f$. |
| * The return value is the average, over all points \f$ x_i \f$, of |
| * \f[ |
| * \frac{d_2( x_i ) - d_1(x_i)}{d_2(x_i)}, |
| * \f] |
| * where 0/0 is interpreted as 0. |
| * Clearly, the simplified silhouette coefficient assumes values in |
| * \f$ [0,1] \f$. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| centroids DOUBLE PRECISION[][], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */ |
| ) RETURNS DOUBLE PRECISION |
| LANGUAGE plpgsql VOLATILE |
| AS $$ |
| DECLARE |
| class_rel_source REGCLASS; |
| proc_fn_dist REGPROCEDURE; |
| rel_filtered VARCHAR; |
| ans DOUBLE PRECISION; |
| BEGIN |
| IF (array_upper(centroids,1) IS NULL) THEN |
| RAISE EXCEPTION 'Kmeans error: No valid centroids given.'; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source); |
| |
| -- Validate the expr_point input. Since we don't need a view at this |
| -- point, the output is safe to ignore. |
| PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point); |
| |
| class_rel_source := rel_source; |
| |
| proc_fn_dist := fn_dist |
| || '(DOUBLE PRECISION[], DOUBLE PRECISION[])'; |
| IF (SELECT MADLIB_SCHEMA.is_pg_major_version_less_than(11) = TRUE) THEN |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| ELSE |
| IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR prokind = 'a' |
| FROM pg_proc WHERE oid = proc_fn_dist) THEN |
| RAISE EXCEPTION 'Kmeans error: Distance function has wrong signature or is not a simple function.'; |
| END IF; |
| END IF; |
| |
| ans := MADLIB_SCHEMA.internal_execute_using_silhouette_args($sql$ |
| SELECT |
| avg(CASE |
| WHEN distances[2] = 0 THEN 0 |
| ELSE (distances[2] - distances[1]) / distances[2] |
| END) |
| FROM ( |
| SELECT |
| (MADLIB_SCHEMA._closest_columns( |
| $1, |
| ($sql$ || expr_point || $sql$)::FLOAT8[], |
| 2::INTEGER, |
| $2, |
| $3 |
| )).distances |
| FROM |
| $sql$ || textin(regclassout(class_rel_source)) || $sql$ |
| WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 |
| ) AS two_shortest_distances |
| $sql$, |
| centroids, proc_fn_dist, fn_dist); |
| |
| RETURN ans; |
| END; |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette( |
| rel_source VARCHAR, |
| expr_point VARCHAR, |
| centroids DOUBLE PRECISION[][] |
| ) RETURNS DOUBLE PRECISION |
| STABLE |
| LANGUAGE sql |
| AS $$ |
| SELECT MADLIB_SCHEMA.simple_silhouette($1, $2, $3, |
| 'MADLIB_SCHEMA.squared_dist_norm2') |
| $$ |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Run auto k-Means. |
| * |
| * |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */, |
| seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */, |
| k_selection_algorithm VARCHAR /*+ DEFAULT 'silhouette' */ |
| ) RETURNS VOID AS $$ |
| PythonFunction(`kmeans', `kmeans_auto', `kmeanspp_auto') |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */, |
| seeding_sample_ratio DOUBLE PRECISION /*+ DEFAULT 1.0 */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, $6, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, $5, NULL, NULL, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeanspp_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[] |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeanspp_auto($1, $2, $3, $4, NULL, NULL, NULL, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */, |
| k_selection_algorithm VARCHAR /*+ DEFAULT 'silhouette' */ |
| ) RETURNS VOID AS $$ |
| PythonFunction(`kmeans', `kmeans_auto', `kmeans_random_auto') |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */, |
| min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, $7, $8, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */, |
| max_num_iterations INTEGER /*+ DEFAULT 20 */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, $7, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */, |
| agg_centroid VARCHAR /*+ DEFAULT 'avg' */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, $6, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */ |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, $5, NULL, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_random_auto( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| expr_point VARCHAR, |
| k INTEGER[] |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.kmeans_random_auto($1, $2, $3, $4, NULL, NULL, NULL, NULL, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| pid VARCHAR, |
| expr_point VARCHAR, |
| centroids_table VARCHAR, |
| centroids_col VARCHAR, |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */ |
| ) RETURNS VOID AS $$ |
| PythonFunction(kmeans, kmeans, simple_silhouette_points_str_wrapper) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| pid VARCHAR, |
| expr_point VARCHAR, |
| centroids_table VARCHAR, |
| centroids_col VARCHAR |
| ) RETURNS VOID |
| AS $$ |
| SELECT MADLIB_SCHEMA.simple_silhouette_points($1, $2, $3, $4, $5, $6, |
| 'MADLIB_SCHEMA.squared_dist_norm2') |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| pid VARCHAR, |
| expr_point VARCHAR, |
| centroids DOUBLE PRECISION[], |
| fn_dist VARCHAR /*+ DEFAULT 'squared_dist_norm2' */ |
| ) RETURNS VOID AS $$ |
| PythonFunction(kmeans, kmeans, simple_silhouette_points_dbl_wrapper) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.simple_silhouette_points( |
| rel_source VARCHAR, |
| output_table VARCHAR, |
| pid VARCHAR, |
| expr_point VARCHAR, |
| centroids DOUBLE PRECISION[] |
| ) RETURNS VOID |
| AS $$ |
| SELECT MADLIB_SCHEMA.simple_silhouette_points($1, $2, $3, $4, $5, |
| 'MADLIB_SCHEMA.squared_dist_norm2') |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |