blob: e99b70677d9bdda9a65ee0ed37762fa024e207ea [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */
DROP TABLE IF EXISTS "TEST_s" CASCADE;
CREATE TABLE "TEST_s"(
id1 INTEGER,
id2 INTEGER,
gr1 INTEGER,
gr2 INTEGER,
gr3 TEXT
);
INSERT INTO "TEST_s" VALUES
(1,0,1,1,'a'),
(1,0,1,1,'b'),
(1,0,1,2,'b'),
(1,0,1,2,'c'),
(1,0,1,2,'d'),
(1,0,1,2,'e'),
(1,0,1,5,'c'),
(1,0,1,6,'c'),
(2,0,1,6,'d'),
(3,0,1,1,'a'),
(4,0,1,1,'a'),
(5,0,1,1,'a'),
(6,0,1,1,'a'),
(7,0,1,1,'b'),
(8,0,1,1,'b'),
(9,0,1,1,'b'),
(19,0,1,1,'b'),
(29,0,1,1,'b'),
(39,0,1,1,'b'),
(0,1,1,2,'b'),
(0,2,1,2,'b'),
(0,3,1,2,'b'),
(0,4,1,2,'b'),
(0,5,1,2,'b'),
(0,6,1,2,'b'),
(10,10,2,2,'c'),
(20,20,2,2,'c'),
(30,30,2,2,'c'),
(40,40,2,2,'c'),
(50,50,2,2,'c'),
(60,60,2,2,'c'),
(70,70,2,2,'c'),
(10,10,5,5,'c'),
(50,50,5,5,'c'),
(88,88,5,5,'c'),
(40,40,5,6,'c'),
(50,50,5,6,'c'),
(60,60,5,6,'c'),
(70,70,5,6,'c'),
(10,10,6,6,'c'),
(60,60,6,6,'c'),
(30,30,6,6,'d'),
(40,40,6,6,'d'),
(50,50,6,6,'d'),
(60,60,6,6,'d'),
(70,70,6,6,'d'),
(50,50,4,2,'d'),
(60,60,4,2,'d'),
(70,70,4,2,'d'),
(50,50,3,2,'d'),
(60,60,3,2,'d'),
(70,70,3,2,'d'),
(500,50,NULL,2,'e'),
(600,60,NULL,2,'e'),
(700,70,NULL,2,'e')
;
-- SELECT gr1, count(*) AS c FROM "TEST_s" GROUP BY gr1;
-- gr1 | c
-- -----+----
-- 4 | 3
-- 1 | 18
-- 5 | 7
-- 3 | 3
-- 6 | 7
-- 2 | 7
-- NULL | 3
-- (6 rows)
SELECT gr1, count(*) AS c FROM "TEST_s" GROUP BY gr1;
-- --- Test for random undersampling with replacement
DROP TABLE IF EXISTS out_sr2;
SELECT balance_sample('"TEST_s"', 'out_sr2', 'gr1', 'undersample ', NULL, NULL, TRUE, TRUE);
SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1;
-- SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1;
SELECT assert(count(*) = 0, 'Wrong number of samples on undersampling with replacement on gr1') FROM
(SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1) AS foo WHERE foo.c != 3;
--- Test for random undersampling without replacement
DROP TABLE IF EXISTS out_s;
SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'undersample', NULL, 'gr2, gr3', FALSE);
SELECT * FROM out_s;
SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3;
DROP TABLE IF EXISTS out_s;
SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'oversample', NULL, 'gr2, gr3', FALSE);
SELECT * FROM out_s;
SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3;
DROP TABLE IF EXISTS out_s;
SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'uniform', NULL, 'gr2, gr3', FALSE);
SELECT * FROM out_s;
SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3;
DROP TABLE IF EXISTS out_s;
SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', '1=3', NULL, 'gr2, gr3', FALSE);
SELECT * FROM out_s;
SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3;
-- SELECT assert(count(*) = 0, 'Wrong number of samples on undersampling gr1') FROM
-- (SELECT gr1, gr2, count(*) AS c FROM out_s GROUP BY gr1, gr2) AS foo WHERE foo.c != 3;
-- --- Test for random oversampling
DROP TABLE IF EXISTS out_or3;
SELECT balance_sample('"TEST_s"', 'out_or3', 'gr1', ' oVEr ', NULL, NULL);
SELECT gr1, count(*) AS c FROM out_or3 GROUP BY gr1;
SELECT assert(count(*) = 0, 'Wrong number of samples on oversampling') FROM
(SELECT gr1, count(*) AS c FROM out_or3 GROUP BY gr1) AS foo WHERE foo.c != 25;
--- UNIFORM sampling
DROP TABLE IF EXISTS out_cd2;
SELECT balance_sample('"TEST_s"', 'out_cd2', 'gr1', 'Uniform', NULL, NULL);
SELECT gr1, count(*) AS c FROM out_cd2 GROUP BY gr1;
SELECT assert(count(*) = 0, 'Wrong number of samples on uniform sampling for gr1') FROM
(SELECT gr1, count(*) AS c FROM out_cd2 GROUP BY gr1) AS foo WHERE foo.c != 9;
--- Default sampling should be uniform
DROP TABLE IF EXISTS out_cd3;
SELECT balance_sample('"TEST_s"', 'out_cd3', 'gr1', NULL, 100, NULL, NULL, TRUE);
SELECT gr1, count(*) AS c FROM out_cd3 GROUP BY gr1;
SELECT assert(count(*) = 0, 'Wrong number of samples on uniform sampling for gr1') FROM
(SELECT gr1, count(*) AS c FROM out_cd3 GROUP BY gr1) AS foo WHERE foo.c != 15;
--- Only one class size is specified
DROP TABLE IF EXISTS out_cd4;
SELECT balance_sample('"TEST_s"', 'out_cd4', 'gr1', '2=10', NULL, NULL, TRUE);
SELECT gr1, count(*) AS c FROM out_cd4 GROUP BY gr1;
SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with specified class sizes') from
out_cd4 where gr1 = 2;
SELECT assert(count(*) = 3, 'Wrong number of samples on sampling with specified class sizes') from
out_cd4 where gr1 = 3;
SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with specified class sizes') from
out_cd4 where gr1 = 1;
--- Multiple class sizes with comma delimited string
DROP TABLE IF EXISTS out_cd5;
SELECT balance_sample('"TEST_s"', 'out_cd5', 'gr1', '2= 10, 3=6, 1 = 10', 100, NULL);
select gr1, count(*) from out_cd5 group by gr1;
SELECT assert(count(*) >= 100, 'Wrong number of samples on sampling with comma-delimited list') from out_cd5;
SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with comma-delimited list') from
out_cd5 where gr1 = 2;
SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with comma-delimited list') from
out_cd5 where gr1 = 4;
SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with comma-delimited list') from
out_cd5 where gr1 = 1;
SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with comma-delimited list') from
out_cd5 where gr1 = 5;
SELECT assert(count(*) = 6, 'Wrong number of samples on sampling with comma-delimited list') from
out_cd5 where gr1 = 3;