methods/k-means/src/pg_gp/kmeans.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file kmeans.sql_in
  *
  * @brief SQL functions for k-means clustering
  * @date January 2011
  *
  * @sa For a brief introduction to k-means clustering, see the module
  *     description \ref grp_kmeans.
  *
  *//* ----------------------------------------------------------------------- */

 /**
 @addtogroup grp_kmeans

 @about

 K-means clustering algorithm helps in dividing a list of objects
 into K groups based on the similarity of their attributes.

 Both objects and groups are represented as points in n-dimensional
 space. The similarity is expressed as distance between objects (points) and
 group centers (centroids). Each object is assigned to a group with
 the nearest centroid in a series of iterations, the goal of which is
 to minimize the total distance between points and their centroids.
 The distance is currently measured using l2-norm,
 also know as
 <a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean
 distance</a>.


 This method works on a set of data points accessible in a table or through a view.
 Initial centroids are found according to
 k-means++ algorithm [1].
 Further adjustments are based on the Euclidean distance between
 the current centroids and all available data points or a random subset of them
 , such that there are at least 200 points from each initial cluster.

 The algorithm stops when one of the following conditions is met:
 - fraction of reassigned nodes is not growing
 - fraction of reassigned nodes is smaller than the limit (default = 0.001)
 - reached the maximum number of allowed iterations (default = 20)

 @prereq

 Requires MADlib \link grp_svec svec module\endlink, which provides a sparse
 vector data type.

 @usage

 Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>,
    '<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt>

 Parameters:
     - <em>input_table</em> :      table with the source data
     - <em>k</em> :                max number of clusters to consider
                                   (must be smaller than number of input points)
     - <em>goodness</em> :         goodness of fit test flag [0,1]
     - <em>run_id</em> :           execution identifier
     - <em>output_schema</em> :    target schema for output tables (points, centroids)

 @examp

 1) Prepare the input table/view with the following structure. This method
 currently requires the following column names, as well as the data types:

 \code
 	pid 		INTEGER
 	position 	{SVEC|INT[]|FLOAT[]}
 \endcode

 where:
     - <em>pid</em> is the name of the column with an ID of the object (point)
     - <em>position</em> is the attribute vector (point coordinates)

 2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file.

 3) Call kmeans_run() stored procedure, e.g.:

 \code
 SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema');
 \endcode

 4) Sample output:

 \code
 INFO: Parameters:
 INFO:  * k = 10 (number of centroids)
 INFO:  * input_table = source_schema.source_name
 INFO:  * goodness = 1 (GOF test on)
 INFO:  * run_id = my_test_run
 INFO:  * output_schema = my_schema
 INFO: Seeding 10 centroids...
 INFO: Using sample data set for analysis... (9200 out of 10000 points)
 INFO: ...Iteration 1
 INFO: ...Iteration 2
 INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001
 INFO: Expanding cluster assignment to all points...
 INFO: Calculating goodness of fit...

 Finished k-means clustering on "my_schema"."kmeans_input_table" table.
 Results:
     * 10 centroids (goodness of fit = 2.9787570001).
 Output:
     * table : "my_schema"."kmeans_out_centroids_testrun"
     * table : "my_schema"."kmeans_out_points_testrun"
 Time elapsed: 0 minutes 4.335644 seconds.
 \endcode

 @sa file kmeans.sql_in (documenting the SQL functions)

 @internal
 @sa namespace kmeans (documenting the implementation in Python)
 @endinternal

 @literature

 [1] Wikipedia, K-means++,
     http://en.wikipedia.org/wiki/K-means%2B%2B
 */

 BEGIN;


 /**
  * @internal
  * Support function: takes a single SVEC (A) and an array of SVECs (B)
  * and returns the index of (B) with the shortest distance to (A).
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID(
     p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[]
 )
 RETURNS INTEGER
 AS $$
 DECLARE
     minCID      INTEGER := 1;
     min_val     FLOAT;
     temp_val    FLOAT;
 BEGIN

     -- Check the arguments
     IF p_point is NULL or p_centroids is NULL THEN
         RETURN null;
     END IF;

     min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]);

     FOR i IN 2..array_upper( p_centroids, 1)
     LOOP
         temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]);
         IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN
             min_val = temp_val;
             minCID = i;
         END IF;
     END LOOP;

     RETURN minCID;
 END
 $$ LANGUAGE plpgsql;

 -- Finalize function for _kmeans_meanPosition() aggregate.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC)
 RETURNS MADLIB_SCHEMA.SVEC AS $$
 DECLARE
     new_location FLOAT[];
     new_location2 FLOAT[];
     sum FLOAT;
 BEGIN
     new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid);
     sum = new_location[array_upper(new_location, 1)];
     FOR i in 1..(array_upper(new_location, 1)-1) LOOP
         new_location2[i] = new_location[i]/sum;
     END LOOP;
     RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2);
 END
 $$ LANGUAGE plpgsql;

 -- Sfunc function for _kmeans_meanPosition() aggregate.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC)
 RETURNS MADLIB_SCHEMA.SVEC AS $$
 DECLARE
     new_location MADLIB_SCHEMA.SVEC;
 BEGIN
     new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0));
     IF ($1 IS NOT NULL) THEN
         new_location = $1 + new_location;
     END IF;
     RETURN new_location;
 END
 $$ LANGUAGE plpgsql;

 -- Prefunc function for _kmeans_meanPosition() aggregate.
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$
 DECLARE
 BEGIN
     IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN
         RETURN $1 + $2;
     END IF;
     IF ($1 IS NOT NULL) THEN
         RETURN $1;
     END IF;
     IF ($2 IS NOT NULL) THEN
         RETURN $2;
     END IF;
 END
 $$ LANGUAGE plpgsql;

 /**
  * @internal
  * @brief Compute a mean position for a set of SVECs.
  */
 CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC )
 (
   stype = MADLIB_SCHEMA.SVEC,
   sfunc = MADLIB_SCHEMA.__kmeans_mean_product,
   ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,')
   finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize
 );

 /**
  * @brief Compute a k-means clustering
  *
  * @param input_table Name of relation containing the input data points
  * @param k Number of centroids to generate
  * @param goodness Goodness of fit test flag (allowed values: 0, 1)
  * @param run_id Name/ID of the execution
  * @param output_schema Target schema for the output tables
  * @return Textual summary of the algorithm run, including the names of the
  *     created tables and run time statistics
  *
  * @internal
  * @sa This function is a wrapper for kmeans::kmeans_run()
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_schema text)
   RETURNS text
 AS $$
     import sys
     try:
         from madlib import kmeans
     except:
         sys.path.append("PLPYTHON_LIBDIR")
         from madlib import kmeans

     plpy.execute( 'set client_min_messages=warning');
     return kmeans.kmeans_run( input_table, k, goodness, run_id, output_schema);
 $$ LANGUAGE plpythonu;

 COMMIT;
	/* ----------------------------------------------------------------------- //*
	*
	* @file kmeans.sql_in
	*
	* @brief SQL functions for k-means clustering
	* @date January 2011
	*
	* @sa For a brief introduction to k-means clustering, see the module
	* description \ref grp_kmeans.
	*
	// ----------------------------------------------------------------------- */

	/**
	@addtogroup grp_kmeans

	@about

	K-means clustering algorithm helps in dividing a list of objects
	into K groups based on the similarity of their attributes.

	Both objects and groups are represented as points in n-dimensional
	space. The similarity is expressed as distance between objects (points) and
	group centers (centroids). Each object is assigned to a group with
	the nearest centroid in a series of iterations, the goal of which is
	to minimize the total distance between points and their centroids.
	The distance is currently measured using l2-norm,
	also know as
	<a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean
	distance</a>.


	This method works on a set of data points accessible in a table or through a view.
	Initial centroids are found according to
	k-means++ algorithm [1].
	Further adjustments are based on the Euclidean distance between
	the current centroids and all available data points or a random subset of them
	, such that there are at least 200 points from each initial cluster.

	The algorithm stops when one of the following conditions is met:
	- fraction of reassigned nodes is not growing
	- fraction of reassigned nodes is smaller than the limit (default = 0.001)
	- reached the maximum number of allowed iterations (default = 20)

	@prereq

	Requires MADlib \link grp_svec svec module\endlink, which provides a sparse
	vector data type.

	@usage

	Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>,
	'<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt>

	Parameters:
	- <em>input_table</em> : table with the source data
	- <em>k</em> : max number of clusters to consider
	(must be smaller than number of input points)
	- <em>goodness</em> : goodness of fit test flag [0,1]
	- <em>run_id</em> : execution identifier
	- <em>output_schema</em> : target schema for output tables (points, centroids)

	@examp

	1) Prepare the input table/view with the following structure. This method
	currently requires the following column names, as well as the data types:

	\code
	pid INTEGER
	position {SVEC\|INT[]\|FLOAT[]}
	\endcode

	where:
	- <em>pid</em> is the name of the column with an ID of the object (point)
	- <em>position</em> is the attribute vector (point coordinates)

	2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file.

	3) Call kmeans_run() stored procedure, e.g.:

	\code
	SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema');
	\endcode

	4) Sample output:

	\code
	INFO: Parameters:
	INFO: * k = 10 (number of centroids)
	INFO: * input_table = source_schema.source_name
	INFO: * goodness = 1 (GOF test on)
	INFO: * run_id = my_test_run
	INFO: * output_schema = my_schema
	INFO: Seeding 10 centroids...
	INFO: Using sample data set for analysis... (9200 out of 10000 points)
	INFO: ...Iteration 1
	INFO: ...Iteration 2
	INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001
	INFO: Expanding cluster assignment to all points...
	INFO: Calculating goodness of fit...

	Finished k-means clustering on "my_schema"."kmeans_input_table" table.
	Results:
	* 10 centroids (goodness of fit = 2.9787570001).
	Output:
	* table : "my_schema"."kmeans_out_centroids_testrun"
	* table : "my_schema"."kmeans_out_points_testrun"
	Time elapsed: 0 minutes 4.335644 seconds.
	\endcode

	@sa file kmeans.sql_in (documenting the SQL functions)

	@internal
	@sa namespace kmeans (documenting the implementation in Python)
	@endinternal

	@literature

	[1] Wikipedia, K-means++,
	http://en.wikipedia.org/wiki/K-means%2B%2B
	*/

	BEGIN;


	/**
	* @internal
	* Support function: takes a single SVEC (A) and an array of SVECs (B)
	* and returns the index of (B) with the shortest distance to (A).
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID(
	p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[]
	)
	RETURNS INTEGER
	AS $$
	DECLARE
	minCID INTEGER := 1;
	min_val FLOAT;
	temp_val FLOAT;
	BEGIN

	-- Check the arguments
	IF p_point is NULL or p_centroids is NULL THEN
	RETURN null;
	END IF;

	min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]);

	FOR i IN 2..array_upper( p_centroids, 1)
	LOOP
	temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]);
	IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN
	min_val = temp_val;
	minCID = i;
	END IF;
	END LOOP;

	RETURN minCID;
	END
	$$ LANGUAGE plpgsql;

	-- Finalize function for _kmeans_meanPosition() aggregate.
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC)
	RETURNS MADLIB_SCHEMA.SVEC AS $$
	DECLARE
	new_location FLOAT[];
	new_location2 FLOAT[];
	sum FLOAT;
	BEGIN
	new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid);
	sum = new_location[array_upper(new_location, 1)];
	FOR i in 1..(array_upper(new_location, 1)-1) LOOP
	new_location2[i] = new_location[i]/sum;
	END LOOP;
	RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2);
	END
	$$ LANGUAGE plpgsql;

	-- Sfunc function for _kmeans_meanPosition() aggregate.
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC)
	RETURNS MADLIB_SCHEMA.SVEC AS $$
	DECLARE
	new_location MADLIB_SCHEMA.SVEC;
	BEGIN
	new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0));
	IF ($1 IS NOT NULL) THEN
	new_location = $1 + new_location;
	END IF;
	RETURN new_location;
	END
	$$ LANGUAGE plpgsql;

	-- Prefunc function for _kmeans_meanPosition() aggregate.
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$
	DECLARE
	BEGIN
	IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN
	RETURN $1 + $2;
	END IF;
	IF ($1 IS NOT NULL) THEN
	RETURN $1;
	END IF;
	IF ($2 IS NOT NULL) THEN
	RETURN $2;
	END IF;
	END
	$$ LANGUAGE plpgsql;

	/**
	* @internal
	* @brief Compute a mean position for a set of SVECs.
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC )
	(
	stype = MADLIB_SCHEMA.SVEC,
	sfunc = MADLIB_SCHEMA.__kmeans_mean_product,
	ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,')
	finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize
	);

	/**
	* @brief Compute a k-means clustering
	*
	* @param input_table Name of relation containing the input data points
	* @param k Number of centroids to generate
	* @param goodness Goodness of fit test flag (allowed values: 0, 1)
	* @param run_id Name/ID of the execution
	* @param output_schema Target schema for the output tables
	* @return Textual summary of the algorithm run, including the names of the
	* created tables and run time statistics
	*
	* @internal
	* @sa This function is a wrapper for kmeans::kmeans_run()
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_schema text)
	RETURNS text
	AS $$
	import sys
	try:
	from madlib import kmeans
	except:
	sys.path.append("PLPYTHON_LIBDIR")
	from madlib import kmeans

	plpy.execute( 'set client_min_messages=warning');
	return kmeans.kmeans_run( input_table, k, goodness, run_id, output_schema);
	$$ LANGUAGE plpythonu;

	COMMIT;