blob: 6e86c82aa9600d056727d6640764925ea48af1af [file] [log] [blame]
m4_include(`SQLCommon.m4')
/* -----------------------------------------------------------------------------
* Test k-means.
*
* FIXME: Verify results
* -------------------------------------------------------------------------- */
CREATE TABLE kmeans_2d(
id SERIAL,
x DOUBLE PRECISION,
y DOUBLE PRECISION,
position DOUBLE PRECISION[]
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
INSERT INTO kmeans_2d(x, y, position)
SELECT
x, y,
ARRAY[
x + random() * 15.0,
y + random() * 15.0
]::DOUBLE PRECISION[] AS position
FROM (
SELECT
random() * 100.0 AS x,
random() * 100.0 AS y
FROM generate_series(1,10)
) AS centroids, generate_series(1,100) i;
CREATE TABLE centroids AS
SELECT x,y,position
FROM kmeans_2d
ORDER BY random()
LIMIT 10;
SELECT * FROM kmeanspp('kmeans_2d', 'position', 10);
SELECT * FROM kmeanspp('kmeans_2d', 'position', 10,
'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR,
'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER,
0.001::DOUBLE PRECISION, 0.1::DOUBLE PRECISION );
SELECT * FROM kmeanspp('kmeans_2d', 'position', 10,
'MADLIB_SCHEMA.squared_dist_norm2'::VARCHAR,
'MADLIB_SCHEMA.avg'::VARCHAR, 20::INTEGER,
0.001::DOUBLE PRECISION, 1.5::DOUBLE PRECISION );
SELECT * FROM kmeans_random('kmeans_2d', 'position', 10);
SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position');
SELECT * FROM kmeans('kmeans_2d', 'position', ARRAY[
ARRAY[10,10],
ARRAY[20,20],
ARRAY[30,30],
ARRAY[40,40],
ARRAY[50,50],
ARRAY[60,60],
ARRAY[70,70],
ARRAY[80,80],
ARRAY[90,90],
ARRAY[10,10]
]::DOUBLE PRECISION[][]);
SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm1');
SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm2');
SELECT * FROM kmeans('kmeans_2d', 'array[x,y]', 'centroids', 'array[x,y]');
SELECT * FROM kmeanspp('kmeans_2d', 'array[x,y]', 10);
SELECT * FROM kmeans_random('kmeans_2d', 'arRAy [ x,y]', 10);
DROP TABLE IF EXISTS km_sample;
CREATE TABLE km_sample(pid int, points double precision[]);
COPY km_sample (pid, points) FROM stdin DELIMITER '|';
1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}
2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}
3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}
4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}
5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}
6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}
7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}
8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}
9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}
10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045}
\.
-- Test kmeanspp_auto
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'elbow');
SELECT assert(
elbow > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_pp failed for elbow.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'silhouette');
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_pp failed for silhouette.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 0.1, 'both');
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_pp failed for both.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeanspp_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8]);
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_pp failed for default.')
FROM autokm_out_summary;
-- Test kmeans_random_auto
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'elbow');
SELECT assert(
elbow > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for elbow.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'El');
SELECT assert(
elbow > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for elbow.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[5,6,7,8]);
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for default.')
FROM autokm_out_summary;
SELECT assert(count(*) = 4, 'Kmeans: Auto Kmeans_random output has incorrect number of rows')
FROM (SELECT * FROM autokm_out WHERE k = any(ARRAY[5,6,7,8]))q;
-- Unordered k list test
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[12,3,5,6,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'silhouette');
-- Silhouette Tests
DROP TABLE IF EXISTS km_sample_out, silh_out;
CREATE TABLE km_sample_out AS
SELECT * FROM kmeanspp('km_sample', 'points', 2,
'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001);
-- Test simple_silhouette_points full interface
SELECT * FROM simple_silhouette_points('km_sample', 'silh_out', 'pid', 'points',
'km_sample_out', 'centroids',
'MADLIB_SCHEMA.squared_dist_norm2');
SELECT assert(silh > 0, 'Incorrect silhouette value')
FROM silh_out
WHERE silh IS NOT NULL;
DROP TABLE IF EXISTS silh_out;
-- Test simple_silhouette_points default distance func
SELECT * FROM simple_silhouette_points(
'km_sample', 'silh_out', 'pid', 'points',
'km_sample_out', 'centroids');
SELECT assert(count(*) = 9, 'Incorrect silhouette count')
FROM silh_out
WHERE silh IS NOT NULL;
DROP TABLE IF EXISTS silh_out;
-- Test simple_silhouette_points double precision array centroids
SELECT * FROM simple_silhouette_points(
'km_sample', 'silh_out', 'pid', 'points',
(SELECT centroids FROM km_sample_out));
SELECT assert(silh > 0, 'Incorrect silhouette value')
FROM silh_out
WHERE silh IS NOT NULL;
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for silhouette on unordered k vals')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'silhouetTe');
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for silhouette.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'siL');
SELECT assert(
silhouette > 0 AND objective_fn > 0,
'Kmeans: Auto Kmeans_random failed for silhouette.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'both');
SELECT assert(
silhouette > 0 AND objective_fn > 0 AND
selection_algorithm = 'silhouette',
'Kmeans: Auto Kmeans_random failed for both.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS autokm_out,autokm_out_summary;
SELECT * FROM kmeans_random_auto('kmeans_2d', 'autokm_out', 'array[x,y]', ARRAY[2,3,4,5,6,7,8], 'MADLIB_SCHEMA.squared_dist_norm2',
'MADLIB_SCHEMA.avg', 20, 0.001, 'b');
SELECT assert(
silhouette > 0 AND objective_fn > 0 AND
selection_algorithm = 'silhouette',
'Kmeans: Auto Kmeans_random failed for both.')
FROM autokm_out_summary;
DROP TABLE IF EXISTS silh_out;
-- Test simple_silhouette_points actual values
SELECT * FROM simple_silhouette_points(
'km_sample', 'silh_out', 'pid', 'points',
ARRAY[[1,1,1,1,1,1,1,1,1,1,1,1,1],
[14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065]]::DOUBLE PRECISION[][],'MADLIB_SCHEMA.dist_norm2');
SELECT assert(relative_error(1, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 1;
SELECT assert(relative_error(0.8789, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 2;
SELECT assert(relative_error(0.8966, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 3;
SELECT assert(relative_error(0.7200, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 4;
SELECT assert(relative_error(0.5560, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 5;
SELECT assert(relative_error(0.7348, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 6;
SELECT assert(relative_error(0.8242, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 7;
SELECT assert(relative_error(0.8229, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 8;
SELECT assert(relative_error(0.9655, silh) < 1e-3,
'Incorrect silhouette value')
FROM silh_out
WHERE pid = 9;
SELECT assert(centroid_id = 1 AND neighbor_centroid_id = 0,
'Incorrect centroid ids')
FROM silh_out;
DROP TABLE IF EXISTS km_sample_out, silh_out;
DROP TABLE IF EXISTS km_sample CASCADE;
DROP TABLE IF EXISTS centroids CASCADE;
DROP TABLE IF EXISTS kmeans_2d CASCADE;