blob: 40ac984c00b4547d1557824d77270067fb05a2a6 [file] [log] [blame]
\qecho =================================================================
\qecho === Running: k-means clustering =================================
\qecho =================================================================
\qecho
set search_path="$user",madlib,public;
set client_min_messages=warning;
DROP SCHEMA madlib_kmeans_test CASCADE;
---------------------------------------------------------------------------
-- K-means unit test : setup
---------------------------------------------------------------------------
CREATE SCHEMA madlib_kmeans_test;
---------------------------------------------------------------------------
-- Random_Array: Generates random array
---------------------------------------------------------------------------
CREATE FUNCTION madlib_kmeans_test.random_array (
size INTEGER, class INTEGER, total_classes INTEGER, sparsity FLOAT
) RETURNS FLOAT[] AS $$
declare
g FLOAT[];
begin
FOR i IN 1..size LOOP
g[i] = 0;
END LOOP;
FOR i IN (1+(size*class/total_classes))..((size*class/total_classes)+(size/total_classes)) LOOP
IF (sparsity > random()) THEN
g[i] = random();
END IF;
END LOOP;
RETURN g;
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- RNorm: Generates random number from normal distribution of (mean, sd)
---------------------------------------------------------------------------
CREATE FUNCTION madlib_kmeans_test.rnorm( mean FLOAT, sd FLOAT) RETURNS FLOAT AS $$
declare
begin
RETURN (|/abs(-log(random()*(|/ (2*pi()*sd*sd)))*2*sd*sd))*(sign(random()-.5)) + mean;
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- Gaussian_Sparse_Array: Generates an SVEC of ...
---------------------------------------------------------------------------
CREATE FUNCTION madlib_kmeans_test.gaussian_sparse_array( center FLOAT[], sparsity FLOAT) RETURNS madlib.svec AS $$
declare
g FLOAT[];
sd FLOAT := 1;
begin
FOR i IN 1..array_upper(center,1) LOOP
IF (center[i] > 0) THEN
g[i] = madlib_kmeans_test.RNorm(center[i], sd);
ELSE
g[i] = 0;
END IF;
END LOOP;
RETURN madlib.svec_cast_float8arr(g);
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- min: minimum of two FLOATs
---------------------------------------------------------------------------
-- DROP FUNCTION IF EXISTS madlib_kmeans_test.min( FLOAT, FLOAT);
CREATE FUNCTION madlib_kmeans_test.min( FLOAT, FLOAT) RETURNS FLOAT AS $$
declare
begin
IF ($1 > $2) THEN
RETURN $2;
ELSIF ($2 > $1) THEN
RETURN $1;
ELSIF ($1 = $2) THEN
RETURN $1;
ELSE
RETURN NULL;
END IF;
end;
$$ language plpgsql;
---------------------------------------------------------------------------
-- CreateTestTable:
-- Creates a table and populates it with random points for k-means testing.
-- Table structure: (pid BIGINT, position SVEC)
---------------------------------------------------------------------------
-- DROP FUNCTION IF EXISTS madlib_kmeans_test.create_test_table(
-- p_table TEXT, p_num int, p_dim int, p_nr_of_clusters int, p_sparsity float
-- );
CREATE FUNCTION madlib_kmeans_test.create_test_table(
p_table TEXT, p_num int, p_dim int, p_nr_of_clusters int, p_sparsity float
) RETURNS TEXT AS $$
declare
vector FLOAT[];
sparsity FLOAT;
check BIGINT;
begin
RAISE INFO 'Creating a source table with % points.', p_num;
-- Create the Target table
EXECUTE 'CREATE TABLE ' || p_table || '(
pid BIGINT,
position madlib.SVEC
)';
-- Create Temp table
CREATE TEMP TABLE temp_kmeans_input(
pid BIGINT,
position madlib.SVEC
);
sparsity = p_sparsity * p_nr_of_clusters;
-- Load some data
FOR i in 1..p_num LOOP
IF (i % 1000 = 1) THEN
RAISE INFO '[ % ]', i;
END IF;
IF (i % (p_num/p_nr_of_clusters) < 2) THEN
vector = madlib_kmeans_test.Random_Array( p_dim, CAST(floor((i-1)/(p_num/p_nr_of_clusters)) AS INTEGER), p_nr_of_clusters, sparsity);
END IF;
INSERT INTO temp_kmeans_input VALUES(i, madlib_kmeans_test.Gaussian_Sparse_Array(vector, sparsity));
END LOOP;
-- Move data from Temp to Target table
EXECUTE 'INSERT INTO ' || p_table || ' SELECT * FROM temp_kmeans_input';
RETURN 'Test table populated with ' || p_num || ' points (' || p_table || ').';
end
$$ language plpgsql;
\qecho === create test table ===========================================
drop table if exists madlib_kmeans_test.test_table;
select madlib_kmeans_test.create_test_table( 'madlib_kmeans_test.test_table', 1000, 100, 20, 0.1);
\qecho === run k-means algorythm =======================================
select madlib.kmeans_run( 'madlib_kmeans_test.test_table', 20, 1, 'mytest', 'madlib_kmeans_test');