| \qecho ================================================================= |
| \qecho === Running: k-means clustering ================================= |
| \qecho ================================================================= |
| \qecho |
| |
| set search_path="$user",madlib,public; |
| set client_min_messages=warning; |
| |
| DROP SCHEMA madlib_kmeans_test CASCADE; |
| |
| --------------------------------------------------------------------------- |
| -- K-means unit test : setup |
| --------------------------------------------------------------------------- |
| |
| CREATE SCHEMA madlib_kmeans_test; |
| |
| --------------------------------------------------------------------------- |
| -- Random_Array: Generates random array |
| --------------------------------------------------------------------------- |
| CREATE FUNCTION madlib_kmeans_test.random_array ( |
| size INTEGER, class INTEGER, total_classes INTEGER, sparsity FLOAT |
| ) RETURNS FLOAT[] AS $$ |
| declare |
| g FLOAT[]; |
| begin |
| FOR i IN 1..size LOOP |
| g[i] = 0; |
| END LOOP; |
| |
| FOR i IN (1+(size*class/total_classes))..((size*class/total_classes)+(size/total_classes)) LOOP |
| IF (sparsity > random()) THEN |
| g[i] = random(); |
| END IF; |
| END LOOP; |
| |
| RETURN g; |
| end |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- RNorm: Generates random number from normal distribution of (mean, sd) |
| --------------------------------------------------------------------------- |
| CREATE FUNCTION madlib_kmeans_test.rnorm( mean FLOAT, sd FLOAT) RETURNS FLOAT AS $$ |
| declare |
| begin |
| RETURN (|/abs(-log(random()*(|/ (2*pi()*sd*sd)))*2*sd*sd))*(sign(random()-.5)) + mean; |
| end |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- Gaussian_Sparse_Array: Generates an SVEC of ... |
| --------------------------------------------------------------------------- |
| CREATE FUNCTION madlib_kmeans_test.gaussian_sparse_array( center FLOAT[], sparsity FLOAT) RETURNS madlib.svec AS $$ |
| declare |
| g FLOAT[]; |
| sd FLOAT := 1; |
| begin |
| FOR i IN 1..array_upper(center,1) LOOP |
| IF (center[i] > 0) THEN |
| g[i] = madlib_kmeans_test.RNorm(center[i], sd); |
| ELSE |
| g[i] = 0; |
| END IF; |
| END LOOP; |
| RETURN madlib.svec_cast_float8arr(g); |
| end |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- min: minimum of two FLOATs |
| --------------------------------------------------------------------------- |
| -- DROP FUNCTION IF EXISTS madlib_kmeans_test.min( FLOAT, FLOAT); |
| CREATE FUNCTION madlib_kmeans_test.min( FLOAT, FLOAT) RETURNS FLOAT AS $$ |
| declare |
| begin |
| IF ($1 > $2) THEN |
| RETURN $2; |
| ELSIF ($2 > $1) THEN |
| RETURN $1; |
| ELSIF ($1 = $2) THEN |
| RETURN $1; |
| ELSE |
| RETURN NULL; |
| END IF; |
| end; |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- CreateTestTable: |
| -- Creates a table and populates it with random points for k-means testing. |
| -- Table structure: (pid BIGINT, position SVEC) |
| --------------------------------------------------------------------------- |
| -- DROP FUNCTION IF EXISTS madlib_kmeans_test.create_test_table( |
| -- p_table TEXT, p_num int, p_dim int, p_nr_of_clusters int, p_sparsity float |
| -- ); |
| CREATE FUNCTION madlib_kmeans_test.create_test_table( |
| p_table TEXT, p_num int, p_dim int, p_nr_of_clusters int, p_sparsity float |
| ) RETURNS TEXT AS $$ |
| declare |
| vector FLOAT[]; |
| sparsity FLOAT; |
| check BIGINT; |
| begin |
| |
| RAISE INFO 'Creating a source table with % points.', p_num; |
| |
| -- Create the Target table |
| EXECUTE 'CREATE TABLE ' || p_table || '( |
| pid BIGINT, |
| position madlib.SVEC |
| )'; |
| |
| -- Create Temp table |
| CREATE TEMP TABLE temp_kmeans_input( |
| pid BIGINT, |
| position madlib.SVEC |
| ); |
| |
| sparsity = p_sparsity * p_nr_of_clusters; |
| |
| -- Load some data |
| FOR i in 1..p_num LOOP |
| IF (i % 1000 = 1) THEN |
| RAISE INFO '[ % ]', i; |
| END IF; |
| IF (i % (p_num/p_nr_of_clusters) < 2) THEN |
| vector = madlib_kmeans_test.Random_Array( p_dim, CAST(floor((i-1)/(p_num/p_nr_of_clusters)) AS INTEGER), p_nr_of_clusters, sparsity); |
| END IF; |
| INSERT INTO temp_kmeans_input VALUES(i, madlib_kmeans_test.Gaussian_Sparse_Array(vector, sparsity)); |
| END LOOP; |
| |
| -- Move data from Temp to Target table |
| EXECUTE 'INSERT INTO ' || p_table || ' SELECT * FROM temp_kmeans_input'; |
| |
| RETURN 'Test table populated with ' || p_num || ' points (' || p_table || ').'; |
| |
| end |
| $$ language plpgsql; |
| |
| \qecho === create test table =========================================== |
| drop table if exists madlib_kmeans_test.test_table; |
| select madlib_kmeans_test.create_test_table( 'madlib_kmeans_test.test_table', 1000, 100, 20, 0.1); |
| |
| \qecho === run k-means algorythm ======================================= |
| select madlib.kmeans_run( 'madlib_kmeans_test.test_table', 20, 1, 'mytest', 'madlib_kmeans_test'); |