blob: fc7f50d8981328b380ccf6fffcbf8e2183793e55 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file kmeans.sql_in
*
* @brief SQL functions for k-means clustering
* @date January 2011
*
* @sa For a brief introduction to k-means clustering, see the module
* description \ref grp_kmeans.
*
*//* ----------------------------------------------------------------------- */
/**
@addtogroup grp_kmeans
@about
K-means clustering algorithm helps in dividing a list of objects
into K groups based on the similarity of their attributes.
Both objects and groups are represented as points in n-dimensional
space. The similarity is expressed as distance between objects (points) and
group centers (centroids). Each object is assigned to a group with
the nearest centroid in a series of iterations, the goal of which is
to minimize the total distance between points and their centroids.
The distance is currently measured using l2-norm,
also know as
<a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean
distance</a>.
This method works on a set of data points accessible in a table or through a view.
Initial centroids are found according to
k-means++ algorithm [1].
Further adjustments are based on the Euclidean distance between
the current centroids and all available data points or a random subset of them
, such that there are at least 200 points from each initial cluster.
The algorithm stops when one of the following conditions is met:
- fraction of reassigned nodes is not growing
- fraction of reassigned nodes is smaller than the limit (default = 0.001)
- reached the maximum number of allowed iterations (default = 20)
@prereq
Requires MADlib \link grp_svec svec module\endlink, which provides a sparse
vector data type.
@usage
Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>,
'<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt>
Parameters:
- <em>input_table</em> : table with the source data
- <em>k</em> : max number of clusters to consider
(must be smaller than number of input points)
- <em>goodness</em> : goodness of fit test flag [0,1]
- <em>run_id</em> : execution identifier
- <em>output_schema</em> : target schema for output tables (points, centroids)
@examp
1) Prepare the input table/view with the following structure. This method
currently requires the following column names, as well as the data types:
\code
pid INTEGER
position {SVEC|INT[]|FLOAT[]}
\endcode
where:
- <em>pid</em> is the name of the column with an ID of the object (point)
- <em>position</em> is the attribute vector (point coordinates)
2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file.
3) Call kmeans_run() stored procedure, e.g.:
\code
SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema');
\endcode
4) Sample output:
\code
INFO: Parameters:
INFO: * k = 10 (number of centroids)
INFO: * input_table = source_schema.source_name
INFO: * goodness = 1 (GOF test on)
INFO: * run_id = my_test_run
INFO: * output_schema = my_schema
INFO: Seeding 10 centroids...
INFO: Using sample data set for analysis... (9200 out of 10000 points)
INFO: ...Iteration 1
INFO: ...Iteration 2
INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001
INFO: Expanding cluster assignment to all points...
INFO: Calculating goodness of fit...
Finished k-means clustering on "my_schema"."kmeans_input_table" table.
Results:
* 10 centroids (goodness of fit = 2.9787570001).
Output:
* table : "my_schema"."kmeans_out_centroids_testrun"
* table : "my_schema"."kmeans_out_points_testrun"
Time elapsed: 0 minutes 4.335644 seconds.
\endcode
@sa file kmeans.sql_in (documenting the SQL functions)
@internal
@sa namespace kmeans (documenting the implementation in Python)
@endinternal
@literature
[1] Wikipedia, K-means++,
http://en.wikipedia.org/wiki/K-means%2B%2B
*/
BEGIN;
/**
* @internal
* Support function: takes a single SVEC (A) and an array of SVECs (B)
* and returns the index of (B) with the shortest distance to (A).
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID(
p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[]
)
RETURNS INTEGER
AS $$
DECLARE
minCID INTEGER := 1;
min_val FLOAT;
temp_val FLOAT;
BEGIN
-- Check the arguments
IF p_point is NULL or p_centroids is NULL THEN
RETURN null;
END IF;
min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]);
FOR i IN 2..array_upper( p_centroids, 1)
LOOP
temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]);
IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN
min_val = temp_val;
minCID = i;
END IF;
END LOOP;
RETURN minCID;
END
$$ LANGUAGE plpgsql;
-- Finalize function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC)
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
new_location FLOAT[];
new_location2 FLOAT[];
sum FLOAT;
BEGIN
new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid);
sum = new_location[array_upper(new_location, 1)];
FOR i in 1..(array_upper(new_location, 1)-1) LOOP
new_location2[i] = new_location[i]/sum;
END LOOP;
RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2);
END
$$ LANGUAGE plpgsql;
-- Sfunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC)
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
new_location MADLIB_SCHEMA.SVEC;
BEGIN
new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0));
IF ($1 IS NOT NULL) THEN
new_location = $1 + new_location;
END IF;
RETURN new_location;
END
$$ LANGUAGE plpgsql;
-- Prefunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
BEGIN
IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN
RETURN $1 + $2;
END IF;
IF ($1 IS NOT NULL) THEN
RETURN $1;
END IF;
IF ($2 IS NOT NULL) THEN
RETURN $2;
END IF;
END
$$ LANGUAGE plpgsql;
/**
* @internal
* @brief Compute a mean position for a set of SVECs.
*/
CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC )
(
stype = MADLIB_SCHEMA.SVEC,
sfunc = MADLIB_SCHEMA.__kmeans_mean_product,
ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,')
finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize
);
/**
* @brief Compute a k-means clustering
*
* @param input_table Name of relation containing the input data points
* @param k Number of centroids to generate
* @param goodness Goodness of fit test flag (allowed values: 0, 1)
* @param run_id Name/ID of the execution
* @param output_schema Target schema for the output tables
* @return Textual summary of the algorithm run, including the names of the
* created tables and run time statistics
*
* @internal
* @sa This function is a wrapper for kmeans::kmeans_run()
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_schema text)
RETURNS text
AS $$
import sys
try:
from madlib import kmeans
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import kmeans
plpy.execute( 'set client_min_messages=warning');
return kmeans.kmeans_run( input_table, k, goodness, run_id, output_schema);
$$ LANGUAGE plpythonu;
COMMIT;