| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file kmeans.sql_in |
| * |
| * @brief SQL functions for k-means clustering |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to k-means clustering, see the module |
| * description \ref grp_kmeans. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| /** |
| @addtogroup grp_kmeans |
| |
| @about |
| |
| K-means clustering algorithm helps in dividing a list of objects |
| into K groups based on the similarity of their attributes. |
| |
| Both objects and groups are represented as points in n-dimensional |
| space. The similarity is expressed as distance between objects (points) and |
| group centers (centroids). Each object is assigned to a group with |
| the nearest centroid in a series of iterations, the goal of which is |
| to minimize the total distance between points and their centroids. |
| The distance is currently measured using l2-norm, |
| also know as |
| <a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean |
| distance</a>. |
| |
| |
| This method works on a set of data points accessible in a table or through a view. |
| Initial centroids are found according to |
| k-means++ algorithm [1]. |
| Further adjustments are based on the Euclidean distance between |
| the current centroids and all available data points or a random subset of them |
| , such that there are at least 200 points from each initial cluster. |
| |
| The algorithm stops when one of the following conditions is met: |
| - fraction of reassigned nodes is not growing |
| - fraction of reassigned nodes is smaller than the limit (default = 0.001) |
| - reached the maximum number of allowed iterations (default = 20) |
| |
| @prereq |
| |
| Requires MADlib \link grp_svec svec module\endlink, which provides a sparse |
| vector data type. |
| |
| @usage |
| |
| Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>, |
| '<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt> |
| |
| Parameters: |
| - <em>input_table</em> : table with the source data |
| - <em>k</em> : max number of clusters to consider |
| (must be smaller than number of input points) |
| - <em>goodness</em> : goodness of fit test flag [0,1] |
| - <em>run_id</em> : execution identifier |
| - <em>output_schema</em> : target schema for output tables (points, centroids) |
| |
| @examp |
| |
| 1) Prepare the input table/view with the following structure. This method |
| currently requires the following column names, as well as the data types: |
| |
| \code |
| pid INTEGER |
| position {SVEC|INT[]|FLOAT[]} |
| \endcode |
| |
| where: |
| - <em>pid</em> is the name of the column with an ID of the object (point) |
| - <em>position</em> is the attribute vector (point coordinates) |
| |
| 2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file. |
| |
| 3) Call kmeans_run() stored procedure, e.g.: |
| |
| \code |
| SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema'); |
| \endcode |
| |
| 4) Sample output: |
| |
| \code |
| INFO: Parameters: |
| INFO: * k = 10 (number of centroids) |
| INFO: * input_table = source_schema.source_name |
| INFO: * goodness = 1 (GOF test on) |
| INFO: * run_id = my_test_run |
| INFO: * output_schema = my_schema |
| INFO: Seeding 10 centroids... |
| INFO: Using sample data set for analysis... (9200 out of 10000 points) |
| INFO: ...Iteration 1 |
| INFO: ...Iteration 2 |
| INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001 |
| INFO: Expanding cluster assignment to all points... |
| INFO: Calculating goodness of fit... |
| |
| Finished k-means clustering on "my_schema"."kmeans_input_table" table. |
| Results: |
| * 10 centroids (goodness of fit = 2.9787570001). |
| Output: |
| * table : "my_schema"."kmeans_out_centroids_testrun" |
| * table : "my_schema"."kmeans_out_points_testrun" |
| Time elapsed: 0 minutes 4.335644 seconds. |
| \endcode |
| |
| @sa file kmeans.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa namespace kmeans (documenting the implementation in Python) |
| @endinternal |
| |
| @literature |
| |
| [1] Wikipedia, K-means++, |
| http://en.wikipedia.org/wiki/K-means%2B%2B |
| */ |
| |
| BEGIN; |
| |
| |
| /** |
| * @internal |
| * Support function: takes a single SVEC (A) and an array of SVECs (B) |
| * and returns the index of (B) with the shortest distance to (A). |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID( |
| p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[] |
| ) |
| RETURNS INTEGER |
| AS $$ |
| DECLARE |
| minCID INTEGER := 1; |
| min_val FLOAT; |
| temp_val FLOAT; |
| BEGIN |
| |
| -- Check the arguments |
| IF p_point is NULL or p_centroids is NULL THEN |
| RETURN null; |
| END IF; |
| |
| min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]); |
| |
| FOR i IN 2..array_upper( p_centroids, 1) |
| LOOP |
| temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]); |
| IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN |
| min_val = temp_val; |
| minCID = i; |
| END IF; |
| END LOOP; |
| |
| RETURN minCID; |
| END |
| $$ LANGUAGE plpgsql; |
| |
| -- Finalize function for _kmeans_meanPosition() aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC) |
| RETURNS MADLIB_SCHEMA.SVEC AS $$ |
| DECLARE |
| new_location FLOAT[]; |
| new_location2 FLOAT[]; |
| sum FLOAT; |
| BEGIN |
| new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid); |
| sum = new_location[array_upper(new_location, 1)]; |
| FOR i in 1..(array_upper(new_location, 1)-1) LOOP |
| new_location2[i] = new_location[i]/sum; |
| END LOOP; |
| RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2); |
| END |
| $$ LANGUAGE plpgsql; |
| |
| -- Sfunc function for _kmeans_meanPosition() aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) |
| RETURNS MADLIB_SCHEMA.SVEC AS $$ |
| DECLARE |
| new_location MADLIB_SCHEMA.SVEC; |
| BEGIN |
| new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0)); |
| IF ($1 IS NOT NULL) THEN |
| new_location = $1 + new_location; |
| END IF; |
| RETURN new_location; |
| END |
| $$ LANGUAGE plpgsql; |
| |
| -- Prefunc function for _kmeans_meanPosition() aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$ |
| DECLARE |
| BEGIN |
| IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN |
| RETURN $1 + $2; |
| END IF; |
| IF ($1 IS NOT NULL) THEN |
| RETURN $1; |
| END IF; |
| IF ($2 IS NOT NULL) THEN |
| RETURN $2; |
| END IF; |
| END |
| $$ LANGUAGE plpgsql; |
| |
| /** |
| * @internal |
| * @brief Compute a mean position for a set of SVECs. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC ) |
| ( |
| stype = MADLIB_SCHEMA.SVEC, |
| sfunc = MADLIB_SCHEMA.__kmeans_mean_product, |
| ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,') |
| finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize |
| ); |
| |
| /** |
| * @brief Compute a k-means clustering |
| * |
| * @param input_table Name of relation containing the input data points |
| * @param k Number of centroids to generate |
| * @param goodness Goodness of fit test flag (allowed values: 0, 1) |
| * @param run_id Name/ID of the execution |
| * @param output_schema Target schema for the output tables |
| * @return Textual summary of the algorithm run, including the names of the |
| * created tables and run time statistics |
| * |
| * @internal |
| * @sa This function is a wrapper for kmeans::kmeans_run() |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_schema text) |
| RETURNS text |
| AS $$ |
| import sys |
| try: |
| from madlib import kmeans |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| from madlib import kmeans |
| |
| plpy.execute( 'set client_min_messages=warning'); |
| return kmeans.kmeans_run( input_table, k, goodness, run_id, output_schema); |
| $$ LANGUAGE plpythonu; |
| |
| COMMIT; |