| m4_include(`SQLCommon.m4') |
| /* ----------------------------------------------------------------------------- |
| * Test k-means. |
| * |
| * FIXME: Verify results |
| * -------------------------------------------------------------------------- */ |
| |
| CREATE TABLE kmeans_2d( |
| id SERIAL, |
| x DOUBLE PRECISION, |
| y DOUBLE PRECISION, |
| position DOUBLE PRECISION[] |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| INSERT INTO kmeans_2d(x, y, position) |
| SELECT |
| x, y, |
| ARRAY[ |
| x + random() * 15.0, |
| y + random() * 15.0 |
| ]::DOUBLE PRECISION[] AS position |
| FROM ( |
| SELECT |
| random() * 100.0 AS x, |
| random() * 100.0 AS y |
| FROM generate_series(1,10) |
| ) AS centroids, generate_series(1,100) i; |
| |
| CREATE TABLE centroids AS |
| SELECT x,y,position |
| FROM kmeans_2d |
| ORDER BY random() |
| LIMIT 10; |
| |
| SELECT * FROM kmeanspp('kmeans_2d', 'position', 10); |
| |
| SELECT * FROM kmeanspp('kmeans_2d', 'position', 10, |
| 'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR, |
| 'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER, |
| 0.001::DOUBLE PRECISION, 0.1::DOUBLE PRECISION ); |
| |
| SELECT * FROM kmeanspp('kmeans_2d', 'position', 10, |
| 'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR, |
| 'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER, |
| 0.001::DOUBLE PRECISION, 1.5::DOUBLE PRECISION ); |
| |
| SELECT * FROM kmeans_random('kmeans_2d', 'position', 10); |
| |
| SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position'); |
| |
| SELECT * FROM kmeans('kmeans_2d', 'position', ARRAY[ |
| ARRAY[10,10], |
| ARRAY[20,20], |
| ARRAY[30,30], |
| ARRAY[40,40], |
| ARRAY[50,50], |
| ARRAY[60,60], |
| ARRAY[70,70], |
| ARRAY[80,80], |
| ARRAY[90,90], |
| ARRAY[10,10] |
| ]::DOUBLE PRECISION[][]); |
| |
| SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm1'); |
| SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm2'); |
| |
| SELECT * FROM kmeans('kmeans_2d', 'array[x,y]', 'centroids', 'array[x,y]'); |
| SELECT * FROM kmeanspp('kmeans_2d', 'array[x,y]', 10); |
| SELECT * FROM kmeans_random('kmeans_2d', 'arRAy [ x,y]', 10); |
| |
| DROP TABLE IF EXISTS km_sample; |
| |
| CREATE TABLE km_sample(pid int, points double precision[]); |
| |
| COPY km_sample (pid, points) FROM stdin DELIMITER '|'; |
| 1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065} |
| 2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050} |
| 3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185} |
| 4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480} |
| 5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735} |
| 6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450} |
| 7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290} |
| 8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295} |
| 9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045} |
| 10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045} |
| \. |
| |
| -- Test kmeanspp_auto |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'elbow'); |
| |
| SELECT assert( |
| elbow > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_pp failed for elbow.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'silhouette'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_pp failed for silhouette.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'both'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_pp failed for both.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8]); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_pp failed for default.') |
| FROM autokm_out_summary; |
| |
| -- Test kmeans_random_auto |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'elbow'); |
| |
| SELECT assert( |
| elbow > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for elbow.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'El'); |
| |
| SELECT assert( |
| elbow > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for elbow.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[5,6,7,8]); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for default.') |
| FROM autokm_out_summary; |
| |
| |
| SELECT assert(count(*) = 4, 'Kmeans: Auto Kmeans_random output has incorrect number of rows') |
| FROM (SELECT * FROM autokm_out WHERE k = any(ARRAY[5,6,7,8]))q; |
| |
| -- Unordered k list test |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[12,3,5,6,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'silhouette'); |
| |
| -- Silhouette Tests |
| DROP TABLE IF EXISTS km_sample_out, silh_out; |
| |
| CREATE TABLE km_sample_out AS |
| SELECT * FROM kmeanspp('km_sample', 'points', 2, |
| 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001); |
| |
| -- Test simple_silhouette_points full interface |
| SELECT * FROM simple_silhouette_points('km_sample', 'silh_out', 'pid', 'points', |
| 'km_sample_out', 'centroids', |
| 'MADLIB_SCHEMA.squared_dist_norm2'); |
| |
| SELECT assert(silh > 0, 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE silh IS NOT NULL; |
| |
| DROP TABLE IF EXISTS silh_out; |
| -- Test simple_silhouette_points default distance func |
| SELECT * FROM simple_silhouette_points( |
| 'km_sample', 'silh_out', 'pid', 'points', |
| 'km_sample_out', 'centroids'); |
| |
| SELECT assert(count(*) = 9, 'Incorrect silhouette count') |
| FROM silh_out |
| WHERE silh IS NOT NULL; |
| |
| DROP TABLE IF EXISTS silh_out; |
| -- Test simple_silhouette_points double precision array centroids |
| SELECT * FROM simple_silhouette_points( |
| 'km_sample', 'silh_out', 'pid', 'points', |
| (SELECT centroids FROM km_sample_out)); |
| |
| SELECT assert(silh > 0, 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE silh IS NOT NULL; |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for silhouette on unordered k vals') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'silhouetTe'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for silhouette.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'siL'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0, |
| 'Kmeans: Auto Kmeans_random failed for silhouette.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'both'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0 AND |
| selection_algorithm = 'silhouette', |
| 'Kmeans: Auto Kmeans_random failed for both.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS autokm_out,autokm_out_summary; |
| SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2', |
| 'MADLIB_SCHEMA.avg', 20, 0.001, 'b'); |
| |
| SELECT assert( |
| silhouette > 0 AND objective_fn > 0 AND |
| selection_algorithm = 'silhouette', |
| 'Kmeans: Auto Kmeans_random failed for both.') |
| FROM autokm_out_summary; |
| |
| DROP TABLE IF EXISTS silh_out; |
| -- Test simple_silhouette_points actual values |
| SELECT * FROM simple_silhouette_points( |
| 'km_sample', 'silh_out', 'pid', 'points', |
| ARRAY[[1,1,1,1,1,1,1,1,1,1,1,1,1], |
| [14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065]]::DOUBLE PRECISION[][],'MADLIB_SCHEMA.dist_norm2'); |
| |
| SELECT assert(relative_error(1, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 1; |
| SELECT assert(relative_error(0.8789, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 2; |
| SELECT assert(relative_error(0.8966, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 3; |
| SELECT assert(relative_error(0.7200, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 4; |
| SELECT assert(relative_error(0.5560, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 5; |
| SELECT assert(relative_error(0.7348, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 6; |
| SELECT assert(relative_error(0.8242, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 7; |
| SELECT assert(relative_error(0.8229, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 8; |
| SELECT assert(relative_error(0.9655, silh) < 1e-3, |
| 'Incorrect silhouette value') |
| FROM silh_out |
| WHERE pid = 9; |
| |
| SELECT assert(centroid_id = 1 AND neighbor_centroid_id = 0, |
| 'Incorrect centroid ids') |
| FROM silh_out; |
| |
| DROP TABLE IF EXISTS km_sample_out, silh_out; |
| DROP TABLE IF EXISTS km_sample CASCADE; |
| DROP TABLE IF EXISTS centroids CASCADE; |
| DROP TABLE IF EXISTS kmeans_2d CASCADE; |